This is an automated email from the ASF dual-hosted git repository.

kxiao pushed a commit to branch branch-2.0
in repository https://gitbox.apache.org/repos/asf/doris.git

commit 9b61fcbf22c3cd5e57410b6da009b7474cd9afe2
Author: amory <wangqian...@selectdb.com>
AuthorDate: Fri Jul 14 00:30:07 2023 +0800

    [FIX](map) fix map key-column nullable for arrow serde #21762
    
    Arrow does not support null elements in a map's key column, but the Doris
    map key column is nullable by default. So when any key in a Doris map row
    is null, we write a null for that whole row to Arrow.
---
 .../vec/data_types/serde/data_type_map_serde.cpp   | 15 +++-
 .../serde/data_type_serde_arrow_test.cpp           | 84 +++++++++++++++++++++-
 2 files changed, 96 insertions(+), 3 deletions(-)

diff --git a/be/src/vec/data_types/serde/data_type_map_serde.cpp 
b/be/src/vec/data_types/serde/data_type_map_serde.cpp
index fb3eddc4ff..edb21a60ef 100644
--- a/be/src/vec/data_types/serde/data_type_map_serde.cpp
+++ b/be/src/vec/data_types/serde/data_type_map_serde.cpp
@@ -19,6 +19,7 @@
 
 #include "arrow/array/builder_nested.h"
 #include "util/jsonb_document.h"
+#include "util/simd/bits.h"
 #include "vec/columns/column.h"
 #include "vec/columns/column_const.h"
 #include "vec/columns/column_map.h"
@@ -50,13 +51,23 @@ void DataTypeMapSerDe::write_column_to_arrow(const IColumn& 
column, const NullMa
     auto& builder = assert_cast<arrow::MapBuilder&>(*array_builder);
     auto& map_column = assert_cast<const ColumnMap&>(column);
     const IColumn& nested_keys_column = map_column.get_keys();
-    CHECK(!nested_keys_column.is_nullable());
     const IColumn& nested_values_column = map_column.get_values();
+    // now we default set key value in map is nullable
+    DCHECK(nested_keys_column.is_nullable());
+    DCHECK(nested_values_column.is_nullable());
+    auto keys_nullmap_data =
+            
check_and_get_column<ColumnNullable>(nested_keys_column)->get_null_map_data().data();
     auto& offsets = map_column.get_offsets();
     auto key_builder = builder.key_builder();
     auto value_builder = builder.item_builder();
+
     for (size_t r = start; r < end; ++r) {
-        if (null_map && (*null_map)[r]) {
+        if ((null_map && (*null_map)[r])) {
+            checkArrowStatus(builder.AppendNull(), column.get_name(),
+                             array_builder->type()->name());
+        } else if (simd::contain_byte(keys_nullmap_data + offsets[r - 1],
+                                      offsets[r] - offsets[r - 1], 1)) {
+            // arrow do not support key is null so we just put null with this 
row
             checkArrowStatus(builder.AppendNull(), column.get_name(),
                              array_builder->type()->name());
         } else {
diff --git a/be/test/vec/data_types/serde/data_type_serde_arrow_test.cpp 
b/be/test/vec/data_types/serde/data_type_serde_arrow_test.cpp
index 7792d40839..c1913e6d86 100644
--- a/be/test/vec/data_types/serde/data_type_serde_arrow_test.cpp
+++ b/be/test/vec/data_types/serde/data_type_serde_arrow_test.cpp
@@ -356,7 +356,9 @@ void serialize_and_deserialize_arrow_test() {
             type_desc.add_sub_type(TYPE_STRING, true);
             tslot.__set_slotType(type_desc.to_thrift());
             {
-                DataTypePtr s = std::make_shared<DataTypeString>();
+                DataTypePtr s =
+                        
std::make_shared<DataTypeNullable>(std::make_shared<DataTypeString>());
+                ;
                 DataTypePtr d =
                         
std::make_shared<DataTypeNullable>(std::make_shared<DataTypeString>());
                 DataTypePtr m = std::make_shared<DataTypeMap>(s, d);
@@ -503,4 +505,84 @@ TEST(DataTypeSerDeArrowTest, DataTypeCollectionSerDeTest) {
     serialize_and_deserialize_arrow_test<false>();
 }
 
+TEST(DataTypeSerDeArrowTest, DataTypeMapNullKeySerDeTest) {
+    TupleDescriptor tuple_desc(PTupleDescriptor(), true);
+    TSlotDescriptor tslot;
+    std::string col_name = "map_null_key";
+    tslot.__set_colName(col_name);
+    TypeDescriptor type_desc(TYPE_MAP);
+    type_desc.add_sub_type(TYPE_STRING, true);
+    type_desc.add_sub_type(TYPE_INT, true);
+    tslot.__set_slotType(type_desc.to_thrift());
+    vectorized::Block block;
+    {
+        DataTypePtr s = 
std::make_shared<DataTypeNullable>(std::make_shared<DataTypeString>());
+        ;
+        DataTypePtr d = 
std::make_shared<DataTypeNullable>(std::make_shared<DataTypeInt32>());
+        DataTypePtr m = std::make_shared<DataTypeMap>(s, d);
+        Array k1, k2, v1, v2, k3, v3;
+        k1.push_back(Null());
+        k1.push_back("doris");
+        k1.push_back("clever amory");
+        v1.push_back(11);
+        v1.push_back(Null());
+        v1.push_back(30);
+        k2.push_back("hello amory");
+        k2.push_back("NULL");
+        k2.push_back("cute amory");
+        k2.push_back("doris");
+        v2.push_back(26);
+        v2.push_back(Null());
+        v2.push_back(6);
+        v2.push_back(7);
+        k3.push_back("test");
+        k3.push_back(Null());
+        v3.push_back(11);
+        v3.push_back(30);
+        Map m1, m2, m3;
+        m1.push_back(k1);
+        m1.push_back(v1);
+        m2.push_back(k2);
+        m2.push_back(v2);
+        m3.push_back(k3);
+        m3.push_back(v3);
+        MutableColumnPtr map_column = m->create_column();
+        map_column->reserve(3);
+        map_column->insert(m1);
+        map_column->insert(m2);
+        map_column->insert(m3);
+        vectorized::ColumnWithTypeAndName type_and_name(map_column->get_ptr(), 
m, col_name);
+        block.insert(type_and_name);
+    }
+
+    tslot.__set_col_unique_id(1);
+    SlotDescriptor* slot = new SlotDescriptor(tslot);
+    tuple_desc.add_slot(slot);
+    RowDescriptor row_desc(&tuple_desc, true);
+    // arrow schema
+    std::shared_ptr<arrow::Schema> _arrow_schema;
+    EXPECT_EQ(convert_to_arrow_schema(row_desc, &_arrow_schema), Status::OK());
+
+    // serialize
+    std::shared_ptr<arrow::RecordBatch> result;
+    std::cout << "block structure: " << block.dump_structure() << std::endl;
+    std::cout << "_arrow_schema: " << _arrow_schema->ToString(true) << 
std::endl;
+
+    convert_to_arrow_batch(block, _arrow_schema, arrow::default_memory_pool(), 
&result);
+    Block new_block = block.clone_empty();
+    EXPECT_TRUE(result != nullptr);
+    std::cout << "result: " << result->ToString() << std::endl;
+    // deserialize
+    auto* array = result->GetColumnByName(col_name).get();
+    auto& column_with_type_and_name = new_block.get_by_name(col_name);
+    arrow_column_to_doris_column(array, 0, column_with_type_and_name.column,
+                                 column_with_type_and_name.type, block.rows(), 
"UTC");
+    std::cout << block.dump_data() << std::endl;
+    std::cout << new_block.dump_data() << std::endl;
+    // new block row_index 0, 2 is should be empty
+    EXPECT_EQ(new_block.dump_one_line(0, 1), "{}");
+    EXPECT_EQ(new_block.dump_one_line(2, 1), "{}");
+    EXPECT_EQ(block.dump_data(1, 1), new_block.dump_data(1, 1));
+}
+
 } // namespace doris::vectorized


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org
For additional commands, e-mail: commits-h...@doris.apache.org

Reply via email to