This is an automated email from the ASF dual-hosted git repository. morningman pushed a commit to branch branch-2.1 in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/branch-2.1 by this push: new 4e8148105aa [fix](serde)Fixed the issue that serde may cause be core when reading schema changed text table. (#50105) (#50504) 4e8148105aa is described below commit 4e8148105aad3c812ebdd64620eb6535df8ce953 Author: daidai <changyu...@selectdb.com> AuthorDate: Tue Apr 29 12:54:43 2025 +0800 [fix](serde)Fixed the issue that serde may cause be core when reading schema changed text table. (#50105) (#50504) bp #50105 --- .../data_types/serde/data_type_struct_serde.cpp | 6 + be/src/vec/exec/format/csv/csv_reader.cpp | 6 +- .../data_types/serde/data_type_serde_csv_test.cpp | 232 +++++++++++++++++++++ 3 files changed, 241 insertions(+), 3 deletions(-) diff --git a/be/src/vec/data_types/serde/data_type_struct_serde.cpp b/be/src/vec/data_types/serde/data_type_struct_serde.cpp index d48f42e2227..02e8fb17bf2 100644 --- a/be/src/vec/data_types/serde/data_type_struct_serde.cpp +++ b/be/src/vec/data_types/serde/data_type_struct_serde.cpp @@ -257,6 +257,12 @@ Status DataTypeStructSerDe::deserialize_one_cell_from_hive_text( } } auto& struct_column = static_cast<ColumnStruct&>(column); + + for (auto i = slices.size(); i < struct_column.get_columns().size(); ++i) { + // Hive schema change will cause the number of sub-columns in the file to + // be inconsistent with the number of sub-columns of the column in the table. 
+ slices.emplace_back(options.null_format, options.null_len); + } for (size_t loc = 0; loc < struct_column.get_columns().size(); loc++) { Status st = elem_serdes_ptrs[loc]->deserialize_one_cell_from_hive_text( struct_column.get_column(loc), slices[loc], options, diff --git a/be/src/vec/exec/format/csv/csv_reader.cpp b/be/src/vec/exec/format/csv/csv_reader.cpp index 660e25b2b72..5e37e4834dc 100644 --- a/be/src/vec/exec/format/csv/csv_reader.cpp +++ b/be/src/vec/exec/format/csv/csv_reader.cpp @@ -67,8 +67,6 @@ enum class FileCachePolicy : uint8_t; namespace doris::vectorized { -const static Slice _s_null_slice = Slice("\\N"); - void EncloseCsvTextFieldSplitter::do_split(const Slice& line, std::vector<Slice>* splitted_values) { const char* data = line.data; const auto& column_sep_positions = _text_line_reader_ctx->column_sep_positions(); @@ -656,7 +654,9 @@ Status CsvReader::_fill_dest_columns(const Slice& line, Block* block, int col_idx = _col_idxs[i]; // col idx is out of range, fill with null. const Slice& value = - col_idx < _split_values.size() ? _split_values[col_idx] : _s_null_slice; + col_idx < _split_values.size() + ? 
_split_values[col_idx] + : Slice {_options.null_format, static_cast<size_t>(_options.null_len)}; Slice slice {value.data, value.size}; IColumn* col_ptr = columns[i]; diff --git a/be/test/vec/data_types/serde/data_type_serde_csv_test.cpp b/be/test/vec/data_types/serde/data_type_serde_csv_test.cpp index 936d495cc92..b3e49fdcf8c 100644 --- a/be/test/vec/data_types/serde/data_type_serde_csv_test.cpp +++ b/be/test/vec/data_types/serde/data_type_serde_csv_test.cpp @@ -19,6 +19,9 @@ #include "olap/types.h" // for TypeInfo #include "olap/wrapper_field.h" #include "vec/columns/column.h" +#include "vec/columns/column_array.h" +#include "vec/columns/column_string.h" +#include "vec/columns/column_struct.h" #include "vec/common/string_buffer.hpp" #include "vec/core/field.h" #include "vec/data_types/data_type.h" @@ -482,4 +485,233 @@ TEST(CsvSerde, ComplexTypeSerdeCsvTest) { EXPECT_EQ(str, rand_s_d.to_string()); } } + +TEST(CsvSerde, ComplexTypeSerdeSchemaChangedCsvTest) { + { //struct<string, string> => struct<string, string, string> + DataTypeSerDe::FormatOptions formatOptions; + formatOptions.collection_delim = '\002'; + formatOptions.map_key_delim = '\003'; + + string str = "false\002example"; + DataTypes substruct_dataTypes; + substruct_dataTypes.push_back(make_nullable(std::make_shared<DataTypeString>())); + substruct_dataTypes.push_back(make_nullable(std::make_shared<DataTypeString>())); + substruct_dataTypes.push_back(make_nullable(std::make_shared<DataTypeString>())); + + DataTypePtr data_type_ptr = + make_nullable(std::make_shared<DataTypeStruct>(substruct_dataTypes)); + + auto col = data_type_ptr->create_column(); + Slice slice(str.data(), str.size()); + DataTypeSerDeSPtr serde = data_type_ptr->get_serde(); + Status st = serde->deserialize_one_cell_from_hive_text(*col, slice, formatOptions); + EXPECT_EQ(st, Status::OK()); + auto struct_col = static_cast<ColumnStruct&>( + static_cast<ColumnNullable&>(*col.get()).get_nested_column()); + 
EXPECT_EQ(struct_col.get_column(0).get_data_at(0).to_string(), "false"); + EXPECT_EQ(struct_col.get_column(1).get_data_at(0).to_string(), "example"); + + EXPECT_EQ(struct_col.get_column(0).is_null_at(0), false); + EXPECT_EQ(struct_col.get_column(1).is_null_at(0), false); + EXPECT_EQ(struct_col.get_column(2).is_null_at(0), true); + } + + { // Map<int,String> => array<string> + DataTypeSerDe::FormatOptions formatOptions; + formatOptions.collection_delim = '\002'; + formatOptions.map_key_delim = '\003'; + + string str = "1\003example\0022\003test"; + + DataTypePtr data_type_ptr = make_nullable( + std::make_shared<DataTypeArray>(make_nullable(std::make_shared<DataTypeString>()))); + + auto col = data_type_ptr->create_column(); + Slice slice(str.data(), str.size()); + DataTypeSerDeSPtr serde = data_type_ptr->get_serde(); + Status st = serde->deserialize_one_cell_from_hive_text(*col, slice, formatOptions); + EXPECT_EQ(st, Status::OK()); + auto array_col = static_cast<ColumnArray&>( + static_cast<ColumnNullable&>(*col.get()).get_nested_column()); + + auto string_col = static_cast<ColumnString&>( + static_cast<ColumnNullable&>(array_col.get_data()).get_nested_column()); + EXPECT_EQ(string_col.get_data_at(0).to_string(), "1\003example"); + EXPECT_EQ(string_col.get_data_at(1).to_string(), "2\003test"); + } + + { // null + DataTypeSerDe::FormatOptions formatOptions; + formatOptions.collection_delim = '\002'; + formatOptions.map_key_delim = '\003'; + std::string null_format = "null"; + formatOptions.escape_char = '|'; + formatOptions.null_format = null_format.data(); + formatOptions.null_len = null_format.size(); + + static const string str = "null"; + + DataTypePtr data_type_ptr = make_nullable( + std::make_shared<DataTypeArray>(make_nullable(std::make_shared<DataTypeString>()))); + + auto col = data_type_ptr->create_column(); + Slice slice(str.data(), str.size()); + DataTypeSerDeSPtr serde = data_type_ptr->get_serde(); + Status st = 
serde->deserialize_one_cell_from_hive_text(*col, slice, formatOptions); + EXPECT_EQ(st, Status::OK()); + EXPECT_EQ(col->is_null_at(0), 1); + } + + { // \\N + DataTypeSerDe::FormatOptions formatOptions; + formatOptions.collection_delim = '\002'; + formatOptions.map_key_delim = '\003'; + std::string null_format = "null"; + formatOptions.escape_char = '|'; + formatOptions.null_format = null_format.data(); + formatOptions.null_len = null_format.size(); + + static const string str = "\\N"; + DataTypes substruct_dataTypes; + substruct_dataTypes.push_back(make_nullable(std::make_shared<DataTypeString>())); + substruct_dataTypes.push_back(make_nullable(std::make_shared<DataTypeString>())); + substruct_dataTypes.push_back(make_nullable(std::make_shared<DataTypeString>())); + + DataTypePtr data_type_ptr = + make_nullable(std::make_shared<DataTypeStruct>(substruct_dataTypes)); + + auto col = data_type_ptr->create_column(); + Slice slice(str.data(), str.size()); + DataTypeSerDeSPtr serde = data_type_ptr->get_serde(); + Status st = serde->deserialize_one_cell_from_hive_text(*col, slice, formatOptions); + EXPECT_EQ(st, Status::OK()); + EXPECT_EQ(col->is_null_at(0), 0); + } + + { // \\N + DataTypeSerDe::FormatOptions formatOptions; + formatOptions.collection_delim = '\002'; + formatOptions.map_key_delim = '\003'; + formatOptions.escape_char = '|'; + + static const string str = "\\N"; + DataTypes substruct_dataTypes; + substruct_dataTypes.push_back(make_nullable(std::make_shared<DataTypeString>())); + substruct_dataTypes.push_back(make_nullable(std::make_shared<DataTypeString>())); + substruct_dataTypes.push_back(make_nullable(std::make_shared<DataTypeString>())); + + DataTypePtr data_type_ptr = + make_nullable(std::make_shared<DataTypeStruct>(substruct_dataTypes)); + + auto col = data_type_ptr->create_column(); + Slice slice(str.data(), str.size()); + DataTypeSerDeSPtr serde = data_type_ptr->get_serde(); + Status st = serde->deserialize_one_cell_from_hive_text(*col, slice, 
formatOptions); + EXPECT_EQ(st, Status::OK()); + EXPECT_EQ(col->is_null_at(0), 1); + } + + { // random + auto randomControlChar = [&]() { return static_cast<char>(rand() % 7 + 2); }; + + auto randomPrintableChar = []() { return static_cast<char>(rand() % (126 - 32 + 1) + 32); }; + + auto generateMixedString = [&](int n) -> std::string { + std::string result; + for (int i = 0; i < n; ++i) { + if (rand() % 4 == 0) { + result += randomControlChar(); + } else { + result += randomPrintableChar(); + } + } + for (unsigned char c : result) { + printf("\\x%02X ", c); + } + std::cout << std::endl; + + return result; + }; + + std::srand(std::time(nullptr)); + + for (int i = 0; i < 100; i++) { + DataTypeSerDe::FormatOptions formatOptions; + formatOptions.collection_delim = '\002'; + formatOptions.map_key_delim = '\003'; + string str = generateMixedString(rand() % 100 + 10); + +#define TEST_REPLACE \ + auto col = data_type_ptr->create_column(); \ + Slice slice(str.data(), str.size()); \ + DataTypeSerDeSPtr serde = data_type_ptr->get_serde(); \ + Status st = serde->deserialize_one_cell_from_hive_text(*col, slice, formatOptions); \ + EXPECT_EQ(st, Status::OK()); + + { + DataTypes substruct_dataTypes; + substruct_dataTypes.push_back(make_nullable(std::make_shared<DataTypeString>())); + substruct_dataTypes.push_back(make_nullable(std::make_shared<DataTypeString>())); + substruct_dataTypes.push_back(make_nullable(std::make_shared<DataTypeString>())); + DataTypePtr data_type_ptr = + make_nullable(std::make_shared<DataTypeStruct>(substruct_dataTypes)); + + TEST_REPLACE + } + + { + DataTypePtr data_type_ptr = std::make_shared<DataTypeMap>( + make_nullable(std::make_shared<DataTypeInt32>()), + make_nullable(std::make_shared<DataTypeMap>( + make_nullable(std::make_shared<DataTypeString>()), + make_nullable(std::make_shared<DataTypeInt32>())))); + + TEST_REPLACE + } + + { + DataTypes substruct_dataTypes; + substruct_dataTypes.push_back(make_nullable(std::make_shared<DataTypeString>())); + 
substruct_dataTypes.push_back(make_nullable(std::make_shared<DataTypeString>())); + + DataTypePtr data_type_ptr = make_nullable(std::make_shared<DataTypeMap>( + make_nullable(std::make_shared<DataTypeInt32>()), + make_nullable(std::make_shared<DataTypeStruct>(substruct_dataTypes)))); + TEST_REPLACE + } + + { + DataTypes substruct_dataTypes; + substruct_dataTypes.push_back(make_nullable(std::make_shared<DataTypeString>())); + substruct_dataTypes.push_back(make_nullable(std::make_shared<DataTypeString>())); + substruct_dataTypes.push_back(make_nullable(std::make_shared<DataTypeInt32>())); + + DataTypes struct_dataTypes; + struct_dataTypes.push_back(make_nullable(std::make_shared<DataTypeInt32>())); + struct_dataTypes.push_back(make_nullable(std::make_shared<DataTypeMap>( + make_nullable(std::make_shared<DataTypeInt32>()), + make_nullable(std::make_shared<DataTypeString>())))); + struct_dataTypes.push_back( + make_nullable(std::make_shared<DataTypeStruct>(substruct_dataTypes))); + struct_dataTypes.push_back(make_nullable(std::make_shared<DataTypeArray>( + make_nullable(std::make_shared<DataTypeInt32>())))); + + DataTypePtr data_type_ptr = + make_nullable(std::make_shared<DataTypeStruct>(struct_dataTypes)); + TEST_REPLACE + } + + { + DataTypePtr data_type_ptr = make_nullable(std::make_shared<DataTypeArray>( + make_nullable(std::make_shared<DataTypeArray>( + make_nullable(std::make_shared<DataTypeMap>( + make_nullable(std::make_shared<DataTypeInt32>()), + make_nullable(std::make_shared<DataTypeString>()))))))); + TEST_REPLACE + } +#undef TEST_REPLACE + } + } +} + } // namespace doris::vectorized \ No newline at end of file --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscribe@doris.apache.org For additional commands, e-mail: commits-help@doris.apache.org