This is an automated email from the ASF dual-hosted git repository.

morningman pushed a commit to branch branch-2.1
in repository https://gitbox.apache.org/repos/asf/doris.git


The following commit(s) were added to refs/heads/branch-2.1 by this push:
     new 4e8148105aa [fix](serde)Fixed the issue that serde may cause be core when reading schema changed text table. (#50105) (#50504)
4e8148105aa is described below

commit 4e8148105aad3c812ebdd64620eb6535df8ce953
Author: daidai <changyu...@selectdb.com>
AuthorDate: Tue Apr 29 12:54:43 2025 +0800

    [fix](serde)Fixed the issue that serde may cause be core when reading schema changed text table. (#50105) (#50504)
    
    bp #50105
---
 .../data_types/serde/data_type_struct_serde.cpp    |   6 +
 be/src/vec/exec/format/csv/csv_reader.cpp          |   6 +-
 .../data_types/serde/data_type_serde_csv_test.cpp  | 232 +++++++++++++++++++++
 3 files changed, 241 insertions(+), 3 deletions(-)

diff --git a/be/src/vec/data_types/serde/data_type_struct_serde.cpp b/be/src/vec/data_types/serde/data_type_struct_serde.cpp
index d48f42e2227..02e8fb17bf2 100644
--- a/be/src/vec/data_types/serde/data_type_struct_serde.cpp
+++ b/be/src/vec/data_types/serde/data_type_struct_serde.cpp
@@ -257,6 +257,12 @@ Status DataTypeStructSerDe::deserialize_one_cell_from_hive_text(
         }
     }
     auto& struct_column = static_cast<ColumnStruct&>(column);
+
+    for (auto i = slices.size(); i < struct_column.get_columns().size(); ++i) {
+        // Hive schema change will cause the number of sub-columns in the file to
+        // be inconsistent with the number of sub-columns of the column in the table.
+        slices.emplace_back(options.null_format, options.null_len);
+    }
     for (size_t loc = 0; loc < struct_column.get_columns().size(); loc++) {
         Status st = elem_serdes_ptrs[loc]->deserialize_one_cell_from_hive_text(
                 struct_column.get_column(loc), slices[loc], options,
diff --git a/be/src/vec/exec/format/csv/csv_reader.cpp b/be/src/vec/exec/format/csv/csv_reader.cpp
index 660e25b2b72..5e37e4834dc 100644
--- a/be/src/vec/exec/format/csv/csv_reader.cpp
+++ b/be/src/vec/exec/format/csv/csv_reader.cpp
@@ -67,8 +67,6 @@ enum class FileCachePolicy : uint8_t;
 
 namespace doris::vectorized {
 
-const static Slice _s_null_slice = Slice("\\N");
-
 void EncloseCsvTextFieldSplitter::do_split(const Slice& line, std::vector<Slice>* splitted_values) {
     const char* data = line.data;
     const auto& column_sep_positions = _text_line_reader_ctx->column_sep_positions();
@@ -656,7 +654,9 @@ Status CsvReader::_fill_dest_columns(const Slice& line, Block* block,
         int col_idx = _col_idxs[i];
         // col idx is out of range, fill with null.
         const Slice& value =
-                col_idx < _split_values.size() ? _split_values[col_idx] : _s_null_slice;
+                col_idx < _split_values.size()
+                        ? _split_values[col_idx]
+                        : Slice {_options.null_format, static_cast<size_t>(_options.null_len)};
         Slice slice {value.data, value.size};
 
         IColumn* col_ptr = columns[i];
diff --git a/be/test/vec/data_types/serde/data_type_serde_csv_test.cpp b/be/test/vec/data_types/serde/data_type_serde_csv_test.cpp
index 936d495cc92..b3e49fdcf8c 100644
--- a/be/test/vec/data_types/serde/data_type_serde_csv_test.cpp
+++ b/be/test/vec/data_types/serde/data_type_serde_csv_test.cpp
@@ -19,6 +19,9 @@
 #include "olap/types.h" // for TypeInfo
 #include "olap/wrapper_field.h"
 #include "vec/columns/column.h"
+#include "vec/columns/column_array.h"
+#include "vec/columns/column_string.h"
+#include "vec/columns/column_struct.h"
 #include "vec/common/string_buffer.hpp"
 #include "vec/core/field.h"
 #include "vec/data_types/data_type.h"
@@ -482,4 +485,233 @@ TEST(CsvSerde, ComplexTypeSerdeCsvTest) {
         EXPECT_EQ(str, rand_s_d.to_string());
     }
 }
+
+TEST(CsvSerde, ComplexTypeSerdeSchemaChangedCsvTest) {
+    { //struct<string, string> => struct<string, string, string>
+        DataTypeSerDe::FormatOptions formatOptions;
+        formatOptions.collection_delim = '\002';
+        formatOptions.map_key_delim = '\003';
+
+        string str = "false\002example";
+        DataTypes substruct_dataTypes;
+        substruct_dataTypes.push_back(make_nullable(std::make_shared<DataTypeString>()));
+        substruct_dataTypes.push_back(make_nullable(std::make_shared<DataTypeString>()));
+        substruct_dataTypes.push_back(make_nullable(std::make_shared<DataTypeString>()));
+
+        DataTypePtr data_type_ptr =
+                make_nullable(std::make_shared<DataTypeStruct>(substruct_dataTypes));
+
+        auto col = data_type_ptr->create_column();
+        Slice slice(str.data(), str.size());
+        DataTypeSerDeSPtr serde = data_type_ptr->get_serde();
+        Status st = serde->deserialize_one_cell_from_hive_text(*col, slice, formatOptions);
+        EXPECT_EQ(st, Status::OK());
+        auto struct_col = static_cast<ColumnStruct&>(
+                static_cast<ColumnNullable&>(*col.get()).get_nested_column());
+        EXPECT_EQ(struct_col.get_column(0).get_data_at(0).to_string(), "false");
+        EXPECT_EQ(struct_col.get_column(1).get_data_at(0).to_string(), "example");
+
+        EXPECT_EQ(struct_col.get_column(0).is_null_at(0), false);
+        EXPECT_EQ(struct_col.get_column(1).is_null_at(0), false);
+        EXPECT_EQ(struct_col.get_column(2).is_null_at(0), true);
+    }
+
+    { // Map<int,String> => array<string>
+        DataTypeSerDe::FormatOptions formatOptions;
+        formatOptions.collection_delim = '\002';
+        formatOptions.map_key_delim = '\003';
+
+        string str = "1\003example\0022\003test";
+
+        DataTypePtr data_type_ptr = make_nullable(
+                std::make_shared<DataTypeArray>(make_nullable(std::make_shared<DataTypeString>())));
+
+        auto col = data_type_ptr->create_column();
+        Slice slice(str.data(), str.size());
+        DataTypeSerDeSPtr serde = data_type_ptr->get_serde();
+        Status st = serde->deserialize_one_cell_from_hive_text(*col, slice, formatOptions);
+        EXPECT_EQ(st, Status::OK());
+        auto array_col = static_cast<ColumnArray&>(
+                static_cast<ColumnNullable&>(*col.get()).get_nested_column());
+
+        auto string_col = static_cast<ColumnString&>(
+                static_cast<ColumnNullable&>(array_col.get_data()).get_nested_column());
+        EXPECT_EQ(string_col.get_data_at(0).to_string(), "1\003example");
+        EXPECT_EQ(string_col.get_data_at(1).to_string(), "2\003test");
+    }
+
+    { // null
+        DataTypeSerDe::FormatOptions formatOptions;
+        formatOptions.collection_delim = '\002';
+        formatOptions.map_key_delim = '\003';
+        std::string null_format = "null";
+        formatOptions.escape_char = '|';
+        formatOptions.null_format = null_format.data();
+        formatOptions.null_len = null_format.size();
+
+        static const string str = "null";
+
+        DataTypePtr data_type_ptr = make_nullable(
+                std::make_shared<DataTypeArray>(make_nullable(std::make_shared<DataTypeString>())));
+
+        auto col = data_type_ptr->create_column();
+        Slice slice(str.data(), str.size());
+        DataTypeSerDeSPtr serde = data_type_ptr->get_serde();
+        Status st = serde->deserialize_one_cell_from_hive_text(*col, slice, formatOptions);
+        EXPECT_EQ(st, Status::OK());
+        EXPECT_EQ(col->is_null_at(0), 1);
+    }
+
+    { //  \\N
+        DataTypeSerDe::FormatOptions formatOptions;
+        formatOptions.collection_delim = '\002';
+        formatOptions.map_key_delim = '\003';
+        std::string null_format = "null";
+        formatOptions.escape_char = '|';
+        formatOptions.null_format = null_format.data();
+        formatOptions.null_len = null_format.size();
+
+        static const string str = "\\N";
+        DataTypes substruct_dataTypes;
+        substruct_dataTypes.push_back(make_nullable(std::make_shared<DataTypeString>()));
+        substruct_dataTypes.push_back(make_nullable(std::make_shared<DataTypeString>()));
+        substruct_dataTypes.push_back(make_nullable(std::make_shared<DataTypeString>()));
+
+        DataTypePtr data_type_ptr =
+                make_nullable(std::make_shared<DataTypeStruct>(substruct_dataTypes));
+
+        auto col = data_type_ptr->create_column();
+        Slice slice(str.data(), str.size());
+        DataTypeSerDeSPtr serde = data_type_ptr->get_serde();
+        Status st = serde->deserialize_one_cell_from_hive_text(*col, slice, formatOptions);
+        EXPECT_EQ(st, Status::OK());
+        EXPECT_EQ(col->is_null_at(0), 0);
+    }
+
+    { //  \\N
+        DataTypeSerDe::FormatOptions formatOptions;
+        formatOptions.collection_delim = '\002';
+        formatOptions.map_key_delim = '\003';
+        formatOptions.escape_char = '|';
+
+        static const string str = "\\N";
+        DataTypes substruct_dataTypes;
+        substruct_dataTypes.push_back(make_nullable(std::make_shared<DataTypeString>()));
+        substruct_dataTypes.push_back(make_nullable(std::make_shared<DataTypeString>()));
+        substruct_dataTypes.push_back(make_nullable(std::make_shared<DataTypeString>()));
+
+        DataTypePtr data_type_ptr =
+                make_nullable(std::make_shared<DataTypeStruct>(substruct_dataTypes));
+
+        auto col = data_type_ptr->create_column();
+        Slice slice(str.data(), str.size());
+        DataTypeSerDeSPtr serde = data_type_ptr->get_serde();
+        Status st = serde->deserialize_one_cell_from_hive_text(*col, slice, formatOptions);
+        EXPECT_EQ(st, Status::OK());
+        EXPECT_EQ(col->is_null_at(0), 1);
+    }
+
+    { // random
+        auto randomControlChar = [&]() { return static_cast<char>(rand() % 7 + 2); };
+
+        auto randomPrintableChar = []() { return static_cast<char>(rand() % (126 - 32 + 1) + 32); };
+
+        auto generateMixedString = [&](int n) -> std::string {
+            std::string result;
+            for (int i = 0; i < n; ++i) {
+                if (rand() % 4 == 0) {
+                    result += randomControlChar();
+                } else {
+                    result += randomPrintableChar();
+                }
+            }
+            for (unsigned char c : result) {
+                printf("\\x%02X ", c);
+            }
+            std::cout << std::endl;
+
+            return result;
+        };
+
+        std::srand(std::time(nullptr));
+
+        for (int i = 0; i < 100; i++) {
+            DataTypeSerDe::FormatOptions formatOptions;
+            formatOptions.collection_delim = '\002';
+            formatOptions.map_key_delim = '\003';
+            string str = generateMixedString(rand() % 100 + 10);
+
+#define TEST_REPLACE                                                                    \
+    auto col = data_type_ptr->create_column();                                          \
+    Slice slice(str.data(), str.size());                                                \
+    DataTypeSerDeSPtr serde = data_type_ptr->get_serde();                               \
+    Status st = serde->deserialize_one_cell_from_hive_text(*col, slice, formatOptions); \
+    EXPECT_EQ(st, Status::OK());
+
+            {
+                DataTypes substruct_dataTypes;
+                substruct_dataTypes.push_back(make_nullable(std::make_shared<DataTypeString>()));
+                substruct_dataTypes.push_back(make_nullable(std::make_shared<DataTypeString>()));
+                substruct_dataTypes.push_back(make_nullable(std::make_shared<DataTypeString>()));
+                DataTypePtr data_type_ptr =
+                        make_nullable(std::make_shared<DataTypeStruct>(substruct_dataTypes));
+
+                TEST_REPLACE
+            }
+
+            {
+                DataTypePtr data_type_ptr = std::make_shared<DataTypeMap>(
+                        make_nullable(std::make_shared<DataTypeInt32>()),
+                        make_nullable(std::make_shared<DataTypeMap>(
+                                make_nullable(std::make_shared<DataTypeString>()),
+                                make_nullable(std::make_shared<DataTypeInt32>()))));
+
+                TEST_REPLACE
+            }
+
+            {
+                DataTypes substruct_dataTypes;
+                substruct_dataTypes.push_back(make_nullable(std::make_shared<DataTypeString>()));
+                substruct_dataTypes.push_back(make_nullable(std::make_shared<DataTypeString>()));
+
+                DataTypePtr data_type_ptr = make_nullable(std::make_shared<DataTypeMap>(
+                        make_nullable(std::make_shared<DataTypeInt32>()),
+                        make_nullable(std::make_shared<DataTypeStruct>(substruct_dataTypes))));
+                TEST_REPLACE
+            }
+
+            {
+                DataTypes substruct_dataTypes;
+                substruct_dataTypes.push_back(make_nullable(std::make_shared<DataTypeString>()));
+                substruct_dataTypes.push_back(make_nullable(std::make_shared<DataTypeString>()));
+                substruct_dataTypes.push_back(make_nullable(std::make_shared<DataTypeInt32>()));
+
+                DataTypes struct_dataTypes;
+                struct_dataTypes.push_back(make_nullable(std::make_shared<DataTypeInt32>()));
+                struct_dataTypes.push_back(make_nullable(std::make_shared<DataTypeMap>(
+                        make_nullable(std::make_shared<DataTypeInt32>()),
+                        make_nullable(std::make_shared<DataTypeString>()))));
+                struct_dataTypes.push_back(
+                        make_nullable(std::make_shared<DataTypeStruct>(substruct_dataTypes)));
+                struct_dataTypes.push_back(make_nullable(std::make_shared<DataTypeArray>(
+                        make_nullable(std::make_shared<DataTypeInt32>()))));
+
+                DataTypePtr data_type_ptr =
+                        make_nullable(std::make_shared<DataTypeStruct>(struct_dataTypes));
+                TEST_REPLACE
+            }
+
+            {
+                DataTypePtr data_type_ptr = make_nullable(std::make_shared<DataTypeArray>(
+                        make_nullable(std::make_shared<DataTypeArray>(
+                                make_nullable(std::make_shared<DataTypeMap>(
+                                        make_nullable(std::make_shared<DataTypeInt32>()),
+                                        make_nullable(std::make_shared<DataTypeString>())))))));
+                TEST_REPLACE
+            }
+#undef TEST_REPLACE
+        }
+    }
+}
+
 } // namespace doris::vectorized
\ No newline at end of file


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org
For additional commands, e-mail: commits-h...@doris.apache.org

Reply via email to