(doris) branch branch-2.1 updated: [feat](test)add some be ut for orc/parquet reader (#49418) (#49948)

yiguolei Tue, 15 Apr 2025 21:38:58 -0700

This is an automated email from the ASF dual-hosted git repository.

yiguolei pushed a commit to branch branch-2.1
in repository https://gitbox.apache.org/repos/asf/doris.git



The following commit(s) were added to refs/heads/branch-2.1 by this push:
     new 6e448d3a562 [feat](test)add some be ut for orc/parquet reader  
(#49418) (#49948)
6e448d3a562 is described below

commit 6e448d3a5625423c685449a0ff433e9789c08327
Author: daidai <changyu...@selectdb.com>
AuthorDate: Wed Apr 16 12:38:45 2025 +0800

    [feat](test)add some be ut for orc/parquet reader  (#49418) (#49948)
    
    bp #49418
---
 be/src/vec/exec/format/orc/vorc_reader.cpp         |   8 +-
 be/test/vec/exec/orc/orc_convert_dict_test.cpp     | 237 ++++++++++
 .../exec/orc/orc_convert_to_orc_literal_test.cpp   | 216 ++++++++++
 be/test/vec/exec/orc/orc_memory_stream_test.h      | 102 +++++
 be/test/vec/exec/orc/orc_reader_fill_data_test.cpp | 475 +++++++++++++++++++++
 .../vec/exec/orc/orc_reader_init_column_test.cpp   | 359 ++++++++++++++++
 6 files changed, 1391 insertions(+), 6 deletions(-)

diff --git a/be/src/vec/exec/format/orc/vorc_reader.cpp 
b/be/src/vec/exec/format/orc/vorc_reader.cpp
index a98e88c4173..83e3d9dfbb8 100644
--- a/be/src/vec/exec/format/orc/vorc_reader.cpp
+++ b/be/src/vec/exec/format/orc/vorc_reader.cpp
@@ -430,6 +430,8 @@ bool OrcReader::_check_acid_schema(const orc::Type& type) {
                 return false;
             }
         }
+    } else {
+        return false;
     }
     return true;
 }
@@ -1425,15 +1427,9 @@ Status OrcReader::_fill_doris_data_column(const 
std::string& col_name,
     case TypeIndex::Decimal128V3:
         return _decode_decimal_column<Decimal128V3, is_filter>(col_name, 
data_column, data_type,
                                                                cvb, 
num_values);
-    case TypeIndex::Date:
-        return _decode_time_column<VecDateTimeValue, Int64, 
orc::LongVectorBatch, is_filter>(
-                col_name, data_column, cvb, num_values);
     case TypeIndex::DateV2:
         return _decode_time_column<DateV2Value<DateV2ValueType>, UInt32, 
orc::LongVectorBatch,
                                    is_filter>(col_name, data_column, cvb, 
num_values);
-    case TypeIndex::DateTime:
-        return _decode_time_column<VecDateTimeValue, Int64, 
orc::TimestampVectorBatch, is_filter>(
-                col_name, data_column, cvb, num_values);
     case TypeIndex::DateTimeV2:
         return _decode_time_column<DateV2Value<DateTimeV2ValueType>, UInt64,
                                    orc::TimestampVectorBatch, 
is_filter>(col_name, data_column, cvb,
diff --git a/be/test/vec/exec/orc/orc_convert_dict_test.cpp 
b/be/test/vec/exec/orc/orc_convert_dict_test.cpp
new file mode 100644
index 00000000000..bce08cc63db
--- /dev/null
+++ b/be/test/vec/exec/orc/orc_convert_dict_test.cpp
@@ -0,0 +1,237 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <gtest/gtest.h>
+
+#include <memory>
+
+#include "orc/ColumnPrinter.hh"
+#include "vec/columns/column_array.h"
+#include "vec/columns/column_nullable.h"
+#include "vec/columns/column_string.h"
+#include "vec/columns/column_struct.h"
+#include "vec/exec/format/orc/vorc_reader.h"
+
+namespace doris {
+namespace vectorized {
+// Fixture for the dictionary-to-string conversion tests. It declares no members
+// and needs no per-test setup or teardown, so the gtest defaults are used as-is.
+class OrcReaderConvertDictTest : public ::testing::Test {};
+
+// Builds an orc::EncodedStringVectorBatch whose dictionary holds `dict_values`
+// in order.
+//
+// Only the dictionary (blob + offsets) is populated here; the batch's own
+// per-row buffers are not written. After the call, entry i occupies
+// dictionaryBlob[dictionaryOffset[i], dictionaryOffset[i + 1]).
+std::unique_ptr<orc::EncodedStringVectorBatch> create_encoded_string_batch(
+        const std::vector<std::string>& dict_values) {
+    auto batch = std::make_unique<orc::EncodedStringVectorBatch>(1024 * 1024,
+                                                                 *orc::getDefaultPool());
+    batch->dictionary = std::make_unique<orc::StringDictionary>(*orc::getDefaultPool());
+    auto& dict = *batch->dictionary;
+
+    // Total payload size, accumulated in size_t so string lengths are not
+    // narrowed to int.
+    size_t blob_bytes = 0;
+    for (const auto& value : dict_values) {
+        blob_bytes += value.size();
+    }
+
+    // Both buffers are sized with some slack beyond the exact requirement.
+    dict.dictionaryBlob.resize(blob_bytes + 1024);
+    dict.dictionaryOffset.resize(dict_values.size() + 10);
+
+    // Write the first offset explicitly instead of depending on resize()
+    // zero-filling the freshly grown buffer.
+    dict.dictionaryOffset[0] = 0;
+
+    int64_t cursor = 0; // next free byte in dictionaryBlob
+    size_t entry = 0;   // index of the dictionary entry being written
+    for (const auto& value : dict_values) {
+        for (const char ch : value) {
+            dict.dictionaryBlob[cursor] = ch;
+            ++cursor;
+        }
+        ++entry;
+        dict.dictionaryOffset[entry] = cursor; // end of this entry == start of the next
+    }
+
+    return batch;
+}
+
+// Decodes a dictionary-index column against a STRING dictionary with no null
+// map and expects every row to resolve to its dictionary entry.
+TEST_F(OrcReaderConvertDictTest, ConvertDictColumnToStringColumnBasic) {
+    const std::vector<std::string> dictionary = {"hello", "world", "doris", "test"};
+    auto encoded_batch = create_encoded_string_batch(dictionary);
+
+    // Row i refers to dictionary[code of row i].
+    auto code_column = ColumnInt32::create();
+    for (int32_t code : std::vector<int32_t> {0, 1, 2, 3, 1, 0}) {
+        code_column->insert(code);
+    }
+
+    auto string_type = createPrimitiveType(orc::TypeKind::STRING);
+
+    TFileScanRangeParams scan_params;
+    TFileRangeDesc range_desc;
+    auto orc_reader = OrcReader::create_unique(scan_params, range_desc, "", nullptr, true);
+
+    // nullptr null map: no row is flagged as null.
+    auto converted = orc_reader->_convert_dict_column_to_string_column(
+            code_column.get(), nullptr, encoded_batch.get(), string_type.get());
+
+    const auto* strings = assert_cast<const ColumnString*>(converted.get());
+    const std::vector<std::string> expected = {"hello", "world", "doris",
+                                               "test",  "world", "hello"};
+    ASSERT_EQ(strings->size(), expected.size());
+    for (size_t row = 0; row < expected.size(); ++row) {
+        ASSERT_EQ(strings->get_data_at(row).to_string(), expected[row]);
+    }
+}
+
+// Same conversion as the basic case but with a null map supplied: rows flagged
+// as null are expected to come back as empty strings.
+TEST_F(OrcReaderConvertDictTest, ConvertDictColumnToStringColumnWithNulls) {
+    const std::vector<std::string> dictionary = {"hello", "world", "doris"};
+    auto encoded_batch = create_encoded_string_batch(dictionary);
+
+    auto code_column = ColumnInt32::create();
+    for (int32_t code : std::vector<int32_t> {0, 1, 2, 1, 0}) {
+        code_column->insert(code);
+    }
+
+    // Rows 1 and 4 (0-based) are marked null.
+    NullMap row_is_null = {0, 1, 0, 0, 1};
+
+    auto string_type = createPrimitiveType(orc::TypeKind::STRING);
+
+    TFileScanRangeParams scan_params;
+    TFileRangeDesc range_desc;
+    auto orc_reader = OrcReader::create_unique(scan_params, range_desc, "", nullptr, true);
+
+    auto converted = orc_reader->_convert_dict_column_to_string_column(
+            code_column.get(), &row_is_null, encoded_batch.get(), string_type.get());
+
+    const auto* strings = assert_cast<const ColumnString*>(converted.get());
+    // Null rows are expected as "" rather than the dictionary entry they index.
+    const std::vector<std::string> expected = {"hello", "", "doris", "world", ""};
+    ASSERT_EQ(strings->size(), expected.size());
+    for (size_t row = 0; row < expected.size(); ++row) {
+        ASSERT_EQ(strings->get_data_at(row).to_string(), expected[row]);
+    }
+}
+
+// Uses an ORC CHAR type with right-padded dictionary entries and expects the
+// converted strings to have their trailing spaces removed.
+TEST_F(OrcReaderConvertDictTest, ConvertDictColumnToStringColumnChar) {
+    // Entries carry two or three trailing spaces of padding.
+    const std::vector<std::string> dictionary = {"hello  ", "world  ", "test   "};
+    auto encoded_batch = create_encoded_string_batch(dictionary);
+
+    auto code_column = ColumnInt32::create();
+    for (int32_t code : std::vector<int32_t> {0, 1, 2, 1}) {
+        code_column->insert(code);
+    }
+
+    auto char_type = createPrimitiveType(orc::TypeKind::CHAR);
+
+    TFileScanRangeParams scan_params;
+    TFileRangeDesc range_desc;
+    auto orc_reader = OrcReader::create_unique(scan_params, range_desc, "", nullptr, true);
+
+    auto converted = orc_reader->_convert_dict_column_to_string_column(
+            code_column.get(), nullptr, encoded_batch.get(), char_type.get());
+
+    const auto* strings = assert_cast<const ColumnString*>(converted.get());
+    // Every expected value is the dictionary entry without its padding.
+    const std::vector<std::string> expected = {"hello", "world", "test", "world"};
+    ASSERT_EQ(strings->size(), expected.size());
+    for (size_t row = 0; row < expected.size(); ++row) {
+        ASSERT_EQ(strings->get_data_at(row).to_string(), expected[row]);
+    }
+}
+
+// Dictionary with a single empty-string entry: every row indexes it and is
+// expected to decode to "".
+TEST_F(OrcReaderConvertDictTest, ConvertDictColumnToStringColumnEmpty) {
+    const std::vector<std::string> dictionary = {""};
+    auto encoded_batch = create_encoded_string_batch(dictionary);
+
+    auto code_column = ColumnInt32::create();
+    for (int32_t code : std::vector<int32_t> {0, 0, 0}) {
+        code_column->insert(code);
+    }
+
+    auto string_type = createPrimitiveType(orc::TypeKind::STRING);
+
+    TFileScanRangeParams scan_params;
+    TFileRangeDesc range_desc;
+    auto orc_reader = OrcReader::create_unique(scan_params, range_desc, "", nullptr, true);
+
+    auto converted = orc_reader->_convert_dict_column_to_string_column(
+            code_column.get(), nullptr, encoded_batch.get(), string_type.get());
+
+    const auto* strings = assert_cast<const ColumnString*>(converted.get());
+    const std::vector<std::string> expected = {"", "", ""};
+    ASSERT_EQ(strings->size(), expected.size());
+    for (size_t row = 0; row < expected.size(); ++row) {
+        ASSERT_EQ(strings->get_data_at(row).to_string(), expected[row]);
+    }
+}
+
+// Mixed-length dictionary entries (including "") combined with a partial null
+// map; null rows are expected as empty strings, the rest as their entries.
+TEST_F(OrcReaderConvertDictTest, ConvertDictColumnToStringColumnMixed) {
+    const std::vector<std::string> dictionary = {"", "a", "ab", "abc", "abcd"};
+    auto encoded_batch = create_encoded_string_batch(dictionary);
+
+    auto code_column = ColumnInt32::create();
+    for (int32_t code : std::vector<int32_t> {0, 1, 2, 3, 4, 2, 1, 0}) {
+        code_column->insert(code);
+    }
+
+    // Rows 2 and 5 (0-based) are marked null; both index the "ab" entry.
+    NullMap row_is_null = {0, 0, 1, 0, 0, 1, 0, 0};
+
+    auto string_type = createPrimitiveType(orc::TypeKind::STRING);
+
+    TFileScanRangeParams scan_params;
+    TFileRangeDesc range_desc;
+    auto orc_reader = OrcReader::create_unique(scan_params, range_desc, "", nullptr, true);
+
+    auto converted = orc_reader->_convert_dict_column_to_string_column(
+            code_column.get(), &row_is_null, encoded_batch.get(), string_type.get());
+
+    const auto* strings = assert_cast<const ColumnString*>(converted.get());
+    const std::vector<std::string> expected = {"", "a", "", "abc", "abcd", "", "a", ""};
+    ASSERT_EQ(strings->size(), expected.size());
+    for (size_t row = 0; row < expected.size(); ++row) {
+        ASSERT_EQ(strings->get_data_at(row).to_string(), expected[row]);
+    }
+}
+
+} // namespace vectorized
+
+} // namespace doris
diff --git a/be/test/vec/exec/orc/orc_convert_to_orc_literal_test.cpp 
b/be/test/vec/exec/orc/orc_convert_to_orc_literal_test.cpp
new file mode 100644
index 00000000000..ac79f22a6bb
--- /dev/null
+++ b/be/test/vec/exec/orc/orc_convert_to_orc_literal_test.cpp
@@ -0,0 +1,216 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <gtest/gtest.h>
+
+#include <memory>
+
+#include "orc/ColumnPrinter.hh"
+#include "vec/columns/column_struct.h"
+#include "vec/core/types.h"
+#include "vec/data_types/data_type_nullable.h"
+#include "vec/exec/format/orc/vorc_reader.cpp"
+
+namespace doris {
+namespace vectorized {
+// Fixture for the convert_to_orc_literal tests. It declares no members and
+// needs no per-test setup or teardown, so the gtest defaults are used as-is.
+class OrcReaderConvertToOrcLiteralTest : public ::testing::Test {};
+
+TEST_F(OrcReaderConvertToOrcLiteralTest, ConvertTypesTest) { // one scoped case per primitive type
+    // TINYINT test
+    {
+        int8_t tiny_value = 127;
+        StringRef literal_data(reinterpret_cast<char*>(&tiny_value), 
sizeof(tiny_value));
+        auto orc_type_ptr = createPrimitiveType(orc::TypeKind::BYTE);
+        auto [success, literal] =
+                convert_to_orc_literal<TYPE_TINYINT>(orc_type_ptr.get(), 
literal_data.data, 0, 0);
+        ASSERT_TRUE(success);
+        ASSERT_EQ(literal.getLong(), 127);
+    }
+
+    // SMALLINT test
+    {
+        int16_t small_value = 32000;
+        StringRef literal_data(reinterpret_cast<char*>(&small_value), 
sizeof(small_value));
+        auto orc_type_ptr = createPrimitiveType(orc::TypeKind::SHORT);
+        auto [success, literal] =
+                convert_to_orc_literal<TYPE_SMALLINT>(orc_type_ptr.get(), 
literal_data.data, 0, 0);
+        ASSERT_TRUE(success);
+        ASSERT_EQ(literal.getLong(), 32000);
+    }
+
+    // INT test
+    {
+        int32_t int_value = 2147483647;
+        StringRef literal_data(reinterpret_cast<char*>(&int_value), 
sizeof(int_value));
+        auto orc_type_ptr = createPrimitiveType(orc::TypeKind::INT);
+        auto [success, literal] =
+                convert_to_orc_literal<TYPE_INT>(orc_type_ptr.get(), 
literal_data.data, 0, 0);
+        ASSERT_TRUE(success);
+        ASSERT_EQ(literal.getLong(), 2147483647);
+    }
+
+    // BIGINT test
+    {
+        int64_t big_value = 9223372036854775807LL;
+        StringRef literal_data(reinterpret_cast<char*>(&big_value), 
sizeof(big_value));
+        auto orc_type_ptr = createPrimitiveType(orc::TypeKind::LONG);
+        auto [success, literal] =
+                convert_to_orc_literal<TYPE_BIGINT>(orc_type_ptr.get(), 
literal_data.data, 0, 0);
+        ASSERT_TRUE(success);
+        ASSERT_EQ(literal.getLong(), 9223372036854775807LL);
+    }
+    // FLOAT test
+    {
+        float float_value = 3.14159f;
+        StringRef literal_data(reinterpret_cast<char*>(&float_value), 
sizeof(float_value));
+        auto orc_type_ptr = createPrimitiveType(orc::TypeKind::FLOAT);
+        auto [success, literal] =
+                convert_to_orc_literal<TYPE_FLOAT>(orc_type_ptr.get(), 
literal_data.data, 0, 0);
+        ASSERT_TRUE(success);
+        ASSERT_NEAR(literal.getFloat(), 3.14159f, 0.0001);
+    }
+
+    // DOUBLE test
+    {
+        double double_value = 3.14159265358979323846;
+        StringRef literal_data(reinterpret_cast<char*>(&double_value), 
sizeof(double_value));
+        auto orc_type_ptr = createPrimitiveType(orc::TypeKind::DOUBLE);
+        auto [success, literal] =
+                convert_to_orc_literal<TYPE_DOUBLE>(orc_type_ptr.get(), 
literal_data.data, 0, 0);
+        ASSERT_TRUE(success);
+        ASSERT_DOUBLE_EQ(literal.getFloat(), 3.14159265358979323846); // getFloat() is also the accessor used for DOUBLE
+    }
+    // STRING test
+    {
+        std::string str_value = "Hello, World!";
+        StringRef literal_data(str_value.data(), str_value.size());
+        auto orc_type_ptr = createPrimitiveType(orc::TypeKind::STRING);
+        auto [success, literal] =
+                convert_to_orc_literal<TYPE_STRING>(orc_type_ptr.get(), 
(void*)&literal_data, 0, 0); // unlike the other cases, passes the StringRef itself, not literal_data.data
+        ASSERT_TRUE(success);
+        ASSERT_EQ(std::string(literal.getString().data(), 
literal.getString().length()),
+                  "Hello, World!");
+    }
+
+    // DECIMAL32 test
+    {
+        int32_t decimal32_value = 12345;
+        StringRef literal_data(reinterpret_cast<const char*>(&decimal32_value),
+                               sizeof(decimal32_value));
+        auto orc_type_ptr = createPrimitiveType(orc::TypeKind::DECIMAL);
+        auto [success, literal] =
+                convert_to_orc_literal<TYPE_DECIMAL32>(orc_type_ptr.get(), 
literal_data.data, 9, 4);
+        ASSERT_TRUE(success);
+        ASSERT_EQ(literal.getDecimal().toString(), "1.2345");
+    }
+
+    // DECIMAL64 test
+    {
+        int64_t decimal64_value = 123456789012345LL;
+        StringRef literal_data(reinterpret_cast<const char*>(&decimal64_value),
+                               sizeof(decimal64_value));
+        auto orc_type_ptr = createPrimitiveType(orc::TypeKind::DECIMAL);
+        auto [success, literal] = 
convert_to_orc_literal<TYPE_DECIMAL64>(orc_type_ptr.get(),
+                                                                         
literal_data.data, 18, 6);
+        ASSERT_TRUE(success);
+        ASSERT_EQ(literal.getDecimal().toString(), "123456789.012345");
+    }
+
+    // DECIMAL128 test
+    {
+        int128_t decimal128_value = 1234512345;
+        StringRef literal_data(reinterpret_cast<const 
char*>(&decimal128_value),
+                               sizeof(decimal128_value));
+        auto orc_type_ptr = createPrimitiveType(orc::TypeKind::DECIMAL);
+        auto [success, literal] = convert_to_orc_literal<TYPE_DECIMAL128I>(
+                orc_type_ptr.get(), literal_data.data, 38, 9);
+        ASSERT_TRUE(success);
+        ASSERT_EQ(literal.getDecimal().toString(), "1.234512345");
+    }
+
+    { // DATE type test
+        // Normal date
+        VecDateTimeValue date_value;
+        date_value.from_date_str("2024-03-14", 10);
+        StringRef literal_data(reinterpret_cast<const char*>(&date_value), 
sizeof(date_value));
+        auto orc_type_ptr = createPrimitiveType(orc::TypeKind::DATE);
+        auto [success, literal] =
+                convert_to_orc_literal<TYPE_DATE>(orc_type_ptr.get(), 
literal_data.data, 0, 0);
+        ASSERT_TRUE(success);
+
+        // Verify converted day offset
+        int64_t expected_days = 19796; // Day count for 2024-03-14
+        ASSERT_EQ(literal.getDate(), expected_days);
+
+        // Boundary date - minimum value
+        date_value.from_date_str("0001-01-01", 10);
+        literal_data = StringRef(reinterpret_cast<const char*>(&date_value), 
sizeof(date_value));
+        std::tie(success, literal) =
+                convert_to_orc_literal<TYPE_DATE>(orc_type_ptr.get(), 
literal_data.data, 0, 0);
+        ASSERT_TRUE(success); // the day offset for 0001-01-01 is negative
+        ASSERT_EQ(literal.getDate(), -719162);
+
+        // Boundary date - maximum value
+        date_value.from_date_str("9999-12-31", 10);
+        literal_data = StringRef(reinterpret_cast<const char*>(&date_value), 
sizeof(date_value));
+        std::tie(success, literal) =
+                convert_to_orc_literal<TYPE_DATE>(orc_type_ptr.get(), 
literal_data.data, 0, 0);
+        ASSERT_TRUE(success);
+        ASSERT_EQ(literal.getDate(), 2932896);
+    }
+
+    // DATETIME type test
+    {
+        // Normal timestamp
+        VecDateTimeValue datetime_value;
+        datetime_value.from_date_str("2024-03-14 15:30:45", 19);
+        StringRef literal_data(reinterpret_cast<const char*>(&datetime_value),
+                               sizeof(datetime_value));
+        auto orc_type_ptr = createPrimitiveType(orc::TypeKind::TIMESTAMP);
+        auto [success, literal] =
+                convert_to_orc_literal<TYPE_DATETIME>(orc_type_ptr.get(), 
literal_data.data, 0, 0);
+        ASSERT_TRUE(success);
+
+        // Verify the converted timestamp in milliseconds
+        ASSERT_EQ(literal.getTimestamp().getMillis(), 1710430245000); // = 19796 days + 15:30:45, i.e. the input read as UTC
+
+        // Midnight time
+        datetime_value.from_date_str("2024-03-14 00:00:00", 19);
+        literal_data =
+                StringRef(reinterpret_cast<const char*>(&datetime_value), 
sizeof(datetime_value));
+        std::tie(success, literal) =
+                convert_to_orc_literal<TYPE_DATETIME>(orc_type_ptr.get(), 
literal_data.data, 0, 0);
+        ASSERT_TRUE(success);
+        ASSERT_EQ(literal.getTimestamp().getMillis(), 1710374400000); // = 19796 * 86400 * 1000
+
+        // Leap year handling
+        datetime_value.from_date_str("2024-02-29 12:00:00", 19);
+        literal_data =
+                StringRef(reinterpret_cast<const char*>(&datetime_value), 
sizeof(datetime_value));
+        std::tie(success, literal) =
+                convert_to_orc_literal<TYPE_DATETIME>(orc_type_ptr.get(), 
literal_data.data, 0, 0);
+        ASSERT_TRUE(success);
+        ASSERT_EQ(literal.getTimestamp().getMillis(), 1709208000000); // = (19782 * 86400 + 12 * 3600) * 1000
+    }
+}
+} // namespace vectorized
+} // namespace doris
diff --git a/be/test/vec/exec/orc/orc_memory_stream_test.h 
b/be/test/vec/exec/orc/orc_memory_stream_test.h
new file mode 100644
index 00000000000..52c9daad591
--- /dev/null
+++ b/be/test/vec/exec/orc/orc_memory_stream_test.h
@@ -0,0 +1,102 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <gtest/gtest.h>
+
+#include <cstring>
+#include <memory>
+#include <stdexcept>
+#include <string>
+
+#include "orc/ColumnPrinter.hh"
+#include "vec/columns/column_array.h"
+#include "vec/columns/column_map.h"
+#include "vec/columns/column_nullable.h"
+#include "vec/columns/column_string.h"
+#include "vec/columns/column_struct.h"
+#include "vec/columns/columns_number.h"
+#include "vec/core/types.h"
+#include "vec/data_types/data_type_array.h"
+#include "vec/data_types/data_type_date.h"
+#include "vec/data_types/data_type_date_time.h"
+#include "vec/data_types/data_type_decimal.h"
+#include "vec/data_types/data_type_map.h"
+#include "vec/data_types/data_type_nullable.h"
+#include "vec/data_types/data_type_number.h"
+#include "vec/data_types/data_type_string.h"
+#include "vec/data_types/data_type_struct.h"
+#include "vec/exec/format/orc/vorc_reader.h"
+
+namespace doris {
+namespace vectorized {
+
+// orc::OutputStream backed by a fixed-size in-memory buffer.
+//
+// Bytes passed to write() are appended to the buffer; getData()/getLength()
+// expose what has been written so far. The buffer never grows: a write that
+// would not fit throws std::length_error instead of overrunning the allocation.
+// The buffer is held by a unique_ptr, so the stream is not copyable.
+class MemoryOutputStream : public orc::OutputStream {
+public:
+    // Allocates a buffer of `capacity` bytes for the lifetime of the stream.
+    MemoryOutputStream(size_t capacity)
+            : data(std::make_unique<char[]>(capacity)), bufferSize(capacity) {}
+
+    ~MemoryOutputStream() override = default;
+
+    // Number of bytes written so far.
+    uint64_t getLength() const override { return length; }
+
+    uint64_t getNaturalWriteSize() const override { return naturalWriteSize; }
+
+    // Appends `size` bytes from `buf`.
+    // Throws std::length_error if fewer than `size` bytes of capacity remain;
+    // in that case nothing is written.
+    void write(const void* buf, size_t size) override {
+        // length <= bufferSize always holds, so the subtraction cannot wrap.
+        if (size > bufferSize - length) {
+            throw std::length_error("MemoryOutputStream: write exceeds buffer capacity");
+        }
+        if (size != 0) {
+            std::memcpy(data.get() + length, buf, size);
+        }
+        length += size;
+    }
+
+    const std::string& getName() const override { return name; }
+
+    // Start of the buffer; the first getLength() bytes hold written data.
+    // The pointer stays valid until the stream is destroyed.
+    const char* getData() const { return data.get(); }
+
+    // Nothing to flush or release on close.
+    void close() override {}
+
+private:
+    std::unique_ptr<char[]> data; // owned backing buffer
+    size_t bufferSize;            // size of `data` in bytes
+    std::string name = "MemoryOutputStream";
+    uint64_t length = 0;
+    uint64_t naturalWriteSize = 2048;
+};
+
+// orc::InputStream that serves reads from a caller-provided memory range.
+//
+// The stream only stores the pointer and size it is given; it neither copies
+// nor frees the bytes, so the range must outlive the stream.
+class MemoryInputStream : public orc::InputStream {
+public:
+    // `_buffer` points at `_size` readable bytes.
+    MemoryInputStream(const char* _buffer, size_t _size)
+            : buffer(_buffer), size(_size), naturalReadSize(1024), name("MemoryInputStream") {}
+
+    ~MemoryInputStream() override = default;
+
+    // Total number of bytes available.
+    uint64_t getLength() const override { return size; }
+
+    uint64_t getNaturalReadSize() const override { return naturalReadSize; }
+
+    // Copies `length` bytes starting at `offset` into `buf`.
+    // Throws std::out_of_range if [offset, offset + length) is not fully inside
+    // the underlying range, instead of reading past its end.
+    void read(void* buf, uint64_t length, uint64_t offset) override {
+        // Written as two comparisons so that offset + length cannot overflow.
+        if (offset > size || length > size - offset) {
+            throw std::out_of_range("MemoryInputStream: read past end of buffer");
+        }
+        if (length != 0) {
+            std::memcpy(buf, buffer + offset, length);
+        }
+    }
+
+    const std::string& getName() const override { return name; }
+
+private:
+    const char* buffer; // not owned
+    uint64_t size, naturalReadSize;
+    std::string name;
+};
+} // namespace vectorized
+} // namespace doris
\ No newline at end of file
diff --git a/be/test/vec/exec/orc/orc_reader_fill_data_test.cpp 
b/be/test/vec/exec/orc/orc_reader_fill_data_test.cpp
new file mode 100644
index 00000000000..d896419a338
--- /dev/null
+++ b/be/test/vec/exec/orc/orc_reader_fill_data_test.cpp
@@ -0,0 +1,475 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <gtest/gtest.h>
+
+#include <memory>
+
+#include "orc/ColumnPrinter.hh"
+#include "orc_memory_stream_test.h"
+#include "vec/columns/column_array.h"
+#include "vec/columns/column_struct.h"
+#include "vec/core/types.h"
+#include "vec/data_types/data_type_array.h"
+#include "vec/data_types/data_type_decimal.h"
+#include "vec/data_types/data_type_map.h"
+#include "vec/data_types/data_type_number.h"
+#include "vec/data_types/data_type_struct.h"
+#include "vec/exec/format/orc/vorc_reader.h"
+
+namespace doris {
+namespace vectorized {
+// Fixture for the _fill_doris_data_column tests. It carries no state, so the
+// default (empty) SetUp/TearDown of ::testing::Test are used as-is.
+class OrcReaderFillDataTest : public ::testing::Test {};
+
+// Builds an orc::LongVectorBatch with `size` rows taken from `values`.
+//
+// `nulls` is honoured only when it has exactly `size` entries (true marks a
+// NULL row); with any other length every row is marked not-null. The data
+// slot of a NULL row is left unassigned.
+std::unique_ptr<orc::LongVectorBatch> create_long_batch(size_t size,
+                                                        const std::vector<int64_t>& values,
+                                                        const std::vector<bool>& nulls = {}) {
+    auto batch = std::make_unique<orc::LongVectorBatch>(size, *orc::getDefaultPool());
+    batch->resize(size);
+    batch->notNull.resize(size);
+
+    const bool use_null_map = (nulls.size() == size);
+    batch->hasNulls = use_null_map;
+
+    for (size_t row = 0; row < size; ++row) {
+        const bool row_is_null = use_null_map && nulls[row];
+        batch->notNull[row] = !row_is_null;
+        if (!row_is_null) {
+            batch->data[row] = values[row];
+        }
+    }
+    return batch;
+}
+
+// Fills an Int64 column from a LongVectorBatch without nulls and checks that
+// every value arrives unchanged.
+TEST_F(OrcReaderFillDataTest, TestFillLongColumn) {
+    const std::vector<int64_t> expected = {1, 2, 3, 4, 5};
+    const size_t num_rows = expected.size();
+
+    auto orc_batch = create_long_batch(num_rows, expected);
+    auto orc_type = createPrimitiveType(orc::TypeKind::LONG);
+
+    auto column = ColumnInt64::create();
+    auto data_type = std::make_shared<DataTypeInt64>();
+    MutableColumnPtr mutable_column = column->assume_mutable();
+
+    TFileScanRangeParams params;
+    TFileRangeDesc range;
+    auto reader = OrcReader::create_unique(params, range, "", nullptr, true);
+
+    Status status = reader->_fill_doris_data_column<false>(
+            "test_long", mutable_column, data_type, orc_type.get(), orc_batch.get(), num_rows);
+
+    ASSERT_TRUE(status.ok());
+    ASSERT_EQ(column->size(), expected.size());
+
+    for (size_t row = 0; row < num_rows; ++row) {
+        ASSERT_EQ(column->get_int(row), expected[row]);
+    }
+}
+
+// Same as TestFillLongColumn, but rows 1 and 3 are NULL in the ORC batch.
+// Only the values of the non-null rows are compared.
+TEST_F(OrcReaderFillDataTest, TestFillLongColumnWithNull) {
+    const std::vector<int64_t> expected = {1, 2, 3, 4, 5};
+    const std::vector<bool> is_null = {false, true, false, true, false};
+    const size_t num_rows = expected.size();
+
+    auto orc_batch = create_long_batch(num_rows, expected, is_null);
+    auto orc_type = createPrimitiveType(orc::TypeKind::LONG);
+
+    auto column = ColumnInt64::create();
+    auto data_type = std::make_shared<DataTypeInt64>();
+    MutableColumnPtr mutable_column = column->assume_mutable();
+
+    TFileScanRangeParams params;
+    TFileRangeDesc range;
+    auto reader = OrcReader::create_unique(params, range, "", nullptr, true);
+
+    Status status = reader->_fill_doris_data_column<false>(
+            "test_long_with_null", mutable_column, data_type, orc_type.get(), orc_batch.get(),
+            num_rows);
+
+    ASSERT_TRUE(status.ok());
+    ASSERT_EQ(column->size(), expected.size());
+
+    for (size_t row = 0; row < num_rows; ++row) {
+        if (is_null[row]) {
+            continue;
+        }
+        ASSERT_EQ(column->get_int(row), expected[row]);
+    }
+}
+
+// Runs _fill_doris_data_column on nested / parameterised ORC types
+// (struct<array<int>>, struct<int,int>, struct<decimal(18,5)>, map<int,float>)
+// and compares the filled Doris column with its Block::dump_data() rendering.
+// Each scope also prints the ORC batch row by row to ease debugging.
+TEST_F(OrcReaderFillDataTest, ComplexTypeConversionTest) {
+    // Array type test: row i holds (i % 5 + 1) copies of the value i.
+    {
+        using namespace orc;
+        std::unique_ptr<orc::Type> type(orc::Type::buildTypeFromString("struct<col1:array<int>>"));
+
+        WriterOptions options;
+        options.setMemoryPool(orc::getDefaultPool());
+
+        MemoryOutputStream memStream(100 * 1024 * 1024);
+        std::unique_ptr<orc::Writer> writer = orc::createWriter(*type, &memStream, options);
+
+        std::unique_ptr<orc::ColumnVectorBatch> batch = writer->createRowBatch(1024);
+        // A failed pointer dynamic_cast yields nullptr; stop before dereferencing it.
+        auto* structBatch = dynamic_cast<orc::StructVectorBatch*>(batch.get());
+        ASSERT_NE(structBatch, nullptr);
+        auto* listBatch = dynamic_cast<orc::ListVectorBatch*>(structBatch->fields[0]);
+        ASSERT_NE(listBatch, nullptr);
+        auto* intBatch = dynamic_cast<orc::LongVectorBatch*>(listBatch->elements.get());
+        ASSERT_NE(intBatch, nullptr);
+        int64_t* data = intBatch->data.data();
+        int64_t* offsets = listBatch->offsets.data();
+        uint64_t rowCount = 20;
+        uint64_t offset = 0;
+        uint64_t maxListLength = 5;
+        for (uint64_t i = 0; i < rowCount; ++i) {
+            offsets[i] = static_cast<int64_t>(offset);
+            for (uint64_t length = i % maxListLength + 1; length != 0; --length) {
+                data[offset++] = static_cast<int64_t>(i);
+            }
+        }
+        // One extra offset closes the last list.
+        offsets[rowCount] = static_cast<int64_t>(offset);
+
+        structBatch->numElements = rowCount;
+        listBatch->numElements = rowCount;
+
+        TFileScanRangeParams params;
+        TFileRangeDesc range;
+        auto reader = OrcReader::create_unique(params, range, "", nullptr, true);
+
+        auto doris_struct_type = std::make_shared<DataTypeStruct>(
+                std::vector<DataTypePtr> {
+                        std::make_shared<DataTypeArray>(std::make_shared<DataTypeInt32>())},
+                std::vector<std::string> {"col1"});
+        MutableColumnPtr doris_column = doris_struct_type->create_column()->assume_mutable();
+
+        Status status = reader->_fill_doris_data_column<false>(
+                "test", doris_column, doris_struct_type, type.get(), structBatch, rowCount);
+
+        ASSERT_TRUE(status.ok());
+        std::string line;
+        std::unique_ptr<orc::ColumnPrinter> printer = orc::createColumnPrinter(line, type.get());
+        printer->reset(*structBatch);
+
+        for (uint64_t i = 0; i < rowCount; ++i) {
+            line.clear();
+            printer->printRow(i);
+            std::cout << "line = " << line << "\n";
+        }
+        Block block {std::vector<ColumnWithTypeAndName> {
+                {doris_column->get_ptr(), doris_struct_type, "cc"}}};
+        std::cout << block.dump_data() << "\n";
+
+        ASSERT_EQ(block.dump_data(),
+                  "+-----------------------------+\n"
+                  "|cc(Struct(col1:Array(Int32)))|\n"
+                  "+-----------------------------+\n"
+                  "|                        {[0]}|\n"
+                  "|                     {[1, 1]}|\n"
+                  "|                  {[2, 2, 2]}|\n"
+                  "|               {[3, 3, 3, 3]}|\n"
+                  "|            {[4, 4, 4, 4, 4]}|\n"
+                  "|                        {[5]}|\n"
+                  "|                     {[6, 6]}|\n"
+                  "|                  {[7, 7, 7]}|\n"
+                  "|               {[8, 8, 8, 8]}|\n"
+                  "|            {[9, 9, 9, 9, 9]}|\n"
+                  "|                       {[10]}|\n"
+                  "|                   {[11, 11]}|\n"
+                  "|               {[12, 12, 12]}|\n"
+                  "|           {[13, 13, 13, 13]}|\n"
+                  "|       {[14, 14, 14, 14, 14]}|\n"
+                  "|                       {[15]}|\n"
+                  "|                   {[16, 16]}|\n"
+                  "|               {[17, 17, 17]}|\n"
+                  "|           {[18, 18, 18, 18]}|\n"
+                  "|       {[19, 19, 19, 19, 19]}|\n"
+                  "+-----------------------------+\n");
+    }
+
+    // Struct of two ints: row i is {i * 100, i * 300}.
+    {
+        using namespace orc;
+        auto type = std::unique_ptr<Type>(Type::buildTypeFromString("struct<col1:int,col2:int>"));
+
+        size_t rowCount = 10;
+        MemoryOutputStream memStream(100 * 1024 * 1024);
+        WriterOptions options;
+        options.setMemoryPool(getDefaultPool());
+        auto writer = createWriter(*type, &memStream, options);
+        auto batch = writer->createRowBatch(rowCount);
+        auto& structBatch = dynamic_cast<StructVectorBatch&>(*batch);
+        auto& longBatch1 = dynamic_cast<LongVectorBatch&>(*structBatch.fields[0]);
+        auto& longBatch2 = dynamic_cast<LongVectorBatch&>(*structBatch.fields[1]);
+        structBatch.numElements = rowCount;
+        longBatch1.numElements = rowCount;
+        longBatch2.numElements = rowCount;
+        for (size_t i = 0; i < rowCount; ++i) {
+            longBatch1.data[i] = static_cast<int64_t>(i * 100);
+            longBatch2.data[i] = static_cast<int64_t>(i * 300);
+        }
+
+        std::string line;
+        std::unique_ptr<orc::ColumnPrinter> printer = orc::createColumnPrinter(line, type.get());
+        printer->reset(structBatch);
+
+        for (size_t i = 0; i < rowCount; ++i) {
+            line.clear();
+            printer->printRow(i);
+            std::cout << "line = " << line << "\n";
+        }
+
+        TFileScanRangeParams params;
+        TFileRangeDesc range;
+        auto reader = OrcReader::create_unique(params, range, "", nullptr, true);
+
+        auto doris_struct_type = std::make_shared<DataTypeStruct>(
+                std::vector<DataTypePtr> {std::make_shared<DataTypeInt32>(),
+                                          std::make_shared<DataTypeInt32>()},
+                std::vector<std::string> {"col1", "col2"});
+        MutableColumnPtr doris_column = doris_struct_type->create_column()->assume_mutable();
+
+        Status status = reader->_fill_doris_data_column<false>(
+                "test", doris_column, doris_struct_type, type.get(), &structBatch, rowCount);
+
+        ASSERT_TRUE(status.ok());
+
+        Block block {std::vector<ColumnWithTypeAndName> {
+                {doris_column->get_ptr(), doris_struct_type, "cc"}}};
+        std::cout << block.dump_data() << "\n";
+
+        ASSERT_EQ(block.dump_data(),
+                  "+----------------------------------+\n"
+                  "|cc(Struct(col1:Int32, col2:Int32))|\n"
+                  "+----------------------------------+\n"
+                  "|                            {0, 0}|\n"
+                  "|                        {100, 300}|\n"
+                  "|                        {200, 600}|\n"
+                  "|                        {300, 900}|\n"
+                  "|                       {400, 1200}|\n"
+                  "|                       {500, 1500}|\n"
+                  "|                       {600, 1800}|\n"
+                  "|                       {700, 2100}|\n"
+                  "|                       {800, 2400}|\n"
+                  "|                       {900, 2700}|\n"
+                  "+----------------------------------+\n");
+    }
+
+    // decimal(18,5): five positive values, five negative values, then +/- 9, 99, ...
+    // for every precision from 1 to 18 digits.
+    {
+        using namespace orc;
+
+        const uint64_t maxPrecision = 18;
+        MemoryOutputStream memStream(100 * 1024 * 1024);
+        MemoryPool* pool = getDefaultPool();
+        std::unique_ptr<Type> type(Type::buildTypeFromString("struct<col1:decimal(18,5)>"));
+        WriterOptions options;
+        options.setMemoryPool(pool);
+
+        uint64_t rowCount = 5;
+        std::unique_ptr<Writer> writer = createWriter(*type, &memStream, options);
+        std::unique_ptr<ColumnVectorBatch> batch =
+                writer->createRowBatch(2 * rowCount + 2 * maxPrecision);
+        // A failed pointer dynamic_cast yields nullptr; stop before dereferencing it.
+        auto* structBatch = dynamic_cast<StructVectorBatch*>(batch.get());
+        ASSERT_NE(structBatch, nullptr);
+        auto* decBatch = dynamic_cast<Decimal64VectorBatch*>(structBatch->fields[0]);
+        ASSERT_NE(decBatch, nullptr);
+        decBatch->scale = 5;
+        decBatch->precision = 18;
+        // write positive decimals
+        for (uint64_t i = 0; i < rowCount; ++i) {
+            decBatch->values[i] = static_cast<int64_t>(i + 10000);
+        }
+
+        // write negative decimals (convert before subtracting so nothing wraps as unsigned)
+        for (uint64_t i = rowCount; i < 2 * rowCount; ++i) {
+            decBatch->values[i] = static_cast<int64_t>(i) - 10000;
+        }
+
+        // write all precision decimals
+        int64_t dec = 0;
+        for (uint64_t i = 2 * rowCount; i < 2 * rowCount + 2 * maxPrecision; i += 2) {
+            dec = dec * 10 + 9;
+            decBatch->values[i] = dec;
+            decBatch->values[i + 1] = -dec;
+        }
+        rowCount = 2 * (rowCount + maxPrecision);
+        structBatch->numElements = decBatch->numElements = rowCount;
+
+        std::string line;
+        std::unique_ptr<orc::ColumnPrinter> printer = orc::createColumnPrinter(line, type.get());
+        printer->reset(*structBatch);
+
+        for (uint64_t i = 0; i < rowCount; ++i) {
+            line.clear();
+            printer->printRow(i);
+            std::cout << "line = " << line << "\n";
+        }
+
+        TFileScanRangeParams params;
+        TFileRangeDesc range;
+        auto reader = OrcReader::create_unique(params, range, "", nullptr, true);
+
+        auto doris_struct_type = std::make_shared<DataTypeStruct>(
+                std::vector<DataTypePtr> {std::make_shared<DataTypeDecimal<Decimal64>>(18, 5)},
+                std::vector<std::string> {"col1"});
+        MutableColumnPtr doris_column = doris_struct_type->create_column()->assume_mutable();
+        reader->_decimal_scale_params.resize(0);
+        reader->_decimal_scale_params_index = 0;
+        Status status = reader->_fill_doris_data_column<false>(
+                "test", doris_column, doris_struct_type, type.get(), structBatch, rowCount);
+
+        ASSERT_TRUE(status.ok());
+
+        Block block {std::vector<ColumnWithTypeAndName> {
+                {doris_column->get_ptr(), doris_struct_type, "cc"}}};
+        std::cout << block.dump_data() << "\n";
+        ASSERT_EQ(block.dump_data(),
+                  "+-------------------------------+\n"
+                  "|cc(Struct(col1:Decimal(18, 5)))|\n"
+                  "+-------------------------------+\n"
+                  "|                      {0.10000}|\n"
+                  "|                      {0.10001}|\n"
+                  "|                      {0.10002}|\n"
+                  "|                      {0.10003}|\n"
+                  "|                      {0.10004}|\n"
+                  "|                     {-0.09995}|\n"
+                  "|                     {-0.09994}|\n"
+                  "|                     {-0.09993}|\n"
+                  "|                     {-0.09992}|\n"
+                  "|                     {-0.09991}|\n"
+                  "|                      {0.00009}|\n"
+                  "|                     {-0.00009}|\n"
+                  "|                      {0.00099}|\n"
+                  "|                     {-0.00099}|\n"
+                  "|                      {0.00999}|\n"
+                  "|                     {-0.00999}|\n"
+                  "|                      {0.09999}|\n"
+                  "|                     {-0.09999}|\n"
+                  "|                      {0.99999}|\n"
+                  "|                     {-0.99999}|\n"
+                  "|                      {9.99999}|\n"
+                  "|                     {-9.99999}|\n"
+                  "|                     {99.99999}|\n"
+                  "|                    {-99.99999}|\n"
+                  "|                    {999.99999}|\n"
+                  "|                   {-999.99999}|\n"
+                  "|                   {9999.99999}|\n"
+                  "|                  {-9999.99999}|\n"
+                  "|                  {99999.99999}|\n"
+                  "|                 {-99999.99999}|\n"
+                  "|                 {999999.99999}|\n"
+                  "|                {-999999.99999}|\n"
+                  "|                {9999999.99999}|\n"
+                  "|               {-9999999.99999}|\n"
+                  "|               {99999999.99999}|\n"
+                  "|              {-99999999.99999}|\n"
+                  "|              {999999999.99999}|\n"
+                  "|             {-999999999.99999}|\n"
+                  "|             {9999999999.99999}|\n"
+                  "|            {-9999999999.99999}|\n"
+                  "|            {99999999999.99999}|\n"
+                  "|           {-99999999999.99999}|\n"
+                  "|           {999999999999.99999}|\n"
+                  "|          {-999999999999.99999}|\n"
+                  "|          {9999999999999.99999}|\n"
+                  "|         {-9999999999999.99999}|\n"
+                  "+-------------------------------+\n");
+    }
+
+    // map<int,float>: row i holds i / 2 identical entries {i * 100 : i * 3}.
+    {
+        using namespace orc;
+        size_t rowCount = 10;
+        MemoryOutputStream memStream(100 * 1024 * 1024);
+        MemoryPool* pool = getDefaultPool();
+        auto type = std::unique_ptr<Type>(Type::buildTypeFromString("map<int,float>"));
+        WriterOptions options;
+        options.setMemoryPool(pool);
+        auto writer = createWriter(*type, &memStream, options);
+        auto batch = writer->createRowBatch(rowCount * 10);
+        auto& mapBatch = dynamic_cast<MapVectorBatch&>(*batch);
+        int64_t* offsets = mapBatch.offsets.data();
+        auto& keyBatch = dynamic_cast<LongVectorBatch&>(*(mapBatch.keys));
+        auto& valueBatch = dynamic_cast<DoubleVectorBatch&>(*(mapBatch.elements));
+
+        mapBatch.numElements = rowCount;
+        uint64_t entryOffset = 0;
+
+        for (size_t i = 0; i < rowCount; ++i) {
+            offsets[i] = static_cast<int64_t>(entryOffset);
+            for (size_t j = 0; j < i / 2; ++j) {
+                keyBatch.data[entryOffset] = static_cast<int64_t>(i * 100);
+                valueBatch.data[entryOffset] = static_cast<double>(i) * 3.0;
+                ++entryOffset;
+            }
+        }
+        // One extra offset closes the last map.
+        offsets[rowCount] = static_cast<int64_t>(entryOffset);
+
+        keyBatch.numElements = entryOffset;
+        valueBatch.numElements = entryOffset;
+
+        std::string line;
+        std::unique_ptr<orc::ColumnPrinter> printer = orc::createColumnPrinter(line, type.get());
+        printer->reset(mapBatch);
+
+        for (size_t i = 0; i < rowCount; ++i) {
+            line.clear();
+
+            printer->printRow(i);
+            std::cout << "line = " << line << "\n";
+        }
+
+        TFileScanRangeParams params;
+        TFileRangeDesc range;
+        auto reader = OrcReader::create_unique(params, range, "", nullptr, true);
+
+        auto doris_map_type = std::make_shared<DataTypeMap>(std::make_shared<DataTypeInt32>(),
+                                                            std::make_shared<DataTypeFloat32>());
+        MutableColumnPtr doris_column = doris_map_type->create_column()->assume_mutable();
+
+        Status status = reader->_fill_doris_data_column<false>(
+                "test", doris_column, doris_map_type, type.get(), &mapBatch, rowCount);
+
+        ASSERT_TRUE(status.ok());
+
+        Block block {std::vector<ColumnWithTypeAndName> {
+                {doris_column->get_ptr(), doris_map_type, "cc"}}};
+        std::cout << block.dump_data() << "\n";
+        ASSERT_EQ(block.dump_data(),
+                  "+-----------------------+\n"
+                  "|cc(Map(Int32, Float32))|\n"
+                  "+-----------------------+\n"
+                  "|                     {}|\n"
+                  "|                     {}|\n"
+                  "|                {200:6}|\n"
+                  "|                {300:9}|\n"
+                  "|       {400:12, 400:12}|\n"
+                  "|       {500:15, 500:15}|\n"
+                  "|{600:18, 600:18, 600...|\n"
+                  "|{700:21, 700:21, 700...|\n"
+                  "|{800:24, 800:24, 800...|\n"
+                  "|{900:27, 900:27, 900...|\n"
+                  "+-----------------------+\n");
+    }
+}
+} // namespace vectorized
+} // namespace doris
\ No newline at end of file
diff --git a/be/test/vec/exec/orc/orc_reader_init_column_test.cpp 
b/be/test/vec/exec/orc/orc_reader_init_column_test.cpp
new file mode 100644
index 00000000000..44cc9cdfc59
--- /dev/null
+++ b/be/test/vec/exec/orc/orc_reader_init_column_test.cpp
@@ -0,0 +1,359 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <gtest/gtest.h>
+
+#include <memory>
+
+#include "orc/ColumnPrinter.hh"
+#include "orc_memory_stream_test.h"
+#include "vec/core/types.h"
+#include "vec/exec/format/orc/vorc_reader.h"
+
+namespace doris {
+namespace vectorized {
+// Fixture for the OrcReader column-initialisation tests. It carries no state,
+// so the default (empty) SetUp/TearDown of ::testing::Test are used as-is.
+class OrcReaderInitColumnTest : public ::testing::Test {};
+// Checks which file columns _init_read_columns() puts into _read_cols for:
+//   1. a column requested by its file name,
+//   2. a column resolved through slot_name_to_schema_pos with
+//      _is_hive1_orc_or_use_idx forced on,
+//   3. a file whose columns are named _col0/_col1/_col2,
+//   4. an ACID file, where requested names differ from the file names in case.
+TEST_F(OrcReaderInitColumnTest, InitReadColumn) {
+    // Writes `schema` plus one untouched row batch into `sink`, closes the
+    // writer and opens an orc::Reader on the bytes produced.
+    // `sink` must outlive the returned reader: MemoryInputStream only keeps a
+    // pointer into the sink's buffer.
+    auto write_and_open = [](const orc::Type& schema, MemoryOutputStream& sink) {
+        const size_t rowCount = 10;
+        orc::MemoryPool* pool = orc::getDefaultPool();
+
+        orc::WriterOptions writer_options;
+        writer_options.setMemoryPool(pool);
+        auto writer = orc::createWriter(schema, &sink, writer_options);
+        auto batch = writer->createRowBatch(rowCount);
+        writer->add(*batch);
+        writer->close();
+
+        auto in_stream = std::make_unique<MemoryInputStream>(sink.getData(), sink.getLength());
+        orc::ReaderOptions reader_options;
+        reader_options.setMemoryPool(*pool);
+        return orc::createReader(std::move(in_stream), reader_options);
+    };
+
+    // 1. Plain lookup by file column name.
+    {
+        auto type = orc::Type::buildTypeFromString("struct<col1:int,col2:int>");
+        MemoryOutputStream memStream(100 * 1024 * 1024);
+
+        TFileScanRangeParams params;
+        TFileRangeDesc range;
+        auto reader = OrcReader::create_unique(params, range, "", nullptr, true);
+        reader->_reader = write_and_open(*type, memStream);
+
+        std::vector<std::string> column_names = {"col1"};
+        reader->_column_names = &column_names;
+        Status st = reader->_init_read_columns();
+        ASSERT_TRUE(st.ok()) << st;
+
+        const std::list<std::string> expected = {"col1"};
+        ASSERT_EQ(expected, reader->_read_cols);
+    }
+
+    // 2. Position-based lookup: slot "xxxxx" is mapped to schema position 0.
+    {
+        auto type = orc::Type::buildTypeFromString("struct<col1:int,col2:int>");
+        MemoryOutputStream memStream(100 * 1024 * 1024);
+
+        TFileScanRangeParams params;
+        params.slot_name_to_schema_pos.insert({"xxxxx", 0});
+        params.__isset.slot_name_to_schema_pos = true;
+        TFileRangeDesc range;
+        auto reader = OrcReader::create_unique(params, range, "", nullptr, true);
+        reader->_reader = write_and_open(*type, memStream);
+        reader->_is_hive1_orc_or_use_idx = true;
+
+        std::vector<std::string> column_names = {"xxxxx"};
+        reader->_column_names = &column_names;
+        Status st = reader->_init_read_columns();
+        ASSERT_TRUE(st.ok()) << st;
+
+        const std::list<std::string> expected = {"col1"};
+        ASSERT_EQ(expected, reader->_read_cols);
+    }
+
+    // 3. File columns named _col0/_col1/_col2; slots a/b/c map to positions 0/1/2.
+    {
+        auto type = orc::Type::buildTypeFromString("struct<_col0:int,_col1:int,_col2:bigint>");
+        MemoryOutputStream memStream(100 * 1024 * 1024);
+
+        TFileScanRangeParams params;
+        params.slot_name_to_schema_pos.insert({"a", 0});
+        params.slot_name_to_schema_pos.insert({"b", 1});
+        params.slot_name_to_schema_pos.insert({"c", 2});
+        params.__isset.slot_name_to_schema_pos = true;
+        TFileRangeDesc range;
+        auto reader = OrcReader::create_unique(params, range, "", nullptr, true);
+        reader->_reader = write_and_open(*type, memStream);
+
+        std::vector<std::string> column_names = {"b", "c"};
+        reader->_column_names = &column_names;
+        Status st = reader->_init_read_columns();
+        ASSERT_TRUE(st.ok()) << st;
+
+        const std::list<std::string> expected = {"_col1", "_col2"};
+        ASSERT_EQ(expected, reader->_read_cols);
+    }
+
+    // 4. ACID layout: the user columns live under the nested "row" struct.
+    {
+        auto acid_type = orc::createStructType();
+        acid_type->addStructField("operation", orc::createPrimitiveType(orc::TypeKind::INT));
+        acid_type->addStructField("originalTransaction",
+                                  orc::createPrimitiveType(orc::TypeKind::LONG));
+        acid_type->addStructField("bucket", orc::createPrimitiveType(orc::TypeKind::INT));
+        acid_type->addStructField("rowId", orc::createPrimitiveType(orc::TypeKind::LONG));
+        acid_type->addStructField("currentTransaction",
+                                  orc::createPrimitiveType(orc::TypeKind::LONG));
+        auto row_type = orc::createStructType();
+        row_type->addStructField("CoL1", orc::createPrimitiveType(orc::TypeKind::LONG));
+        row_type->addStructField("col2", orc::createPrimitiveType(orc::TypeKind::LONG));
+        row_type->addStructField("colUMN3", orc::createPrimitiveType(orc::TypeKind::LONG));
+        acid_type->addStructField("row", std::move(row_type));
+
+        MemoryOutputStream memStream(100 * 1024 * 1024);
+
+        TFileScanRangeParams params;
+        TFileRangeDesc range;
+        auto reader = OrcReader::create_unique(params, range, "", nullptr, true);
+        reader->_reader = write_and_open(*acid_type, memStream);
+
+        std::vector<std::string> column_names = {"col1", "column3"};
+        reader->_column_names = &column_names;
+        reader->_is_acid = true;
+        Status st = reader->_init_read_columns();
+        ASSERT_TRUE(st.ok()) << st;
+
+        const std::list<std::string> expected = {"row.CoL1", "row.colUMN3"};
+        ASSERT_EQ(expected, reader->_read_cols);
+    }
+}
+
+// Verifies _check_acid_schema() against well-formed ACID structs (exact and
+// mixed-case field names), structs with too few or wrongly named fields, and
+// non-struct types.
+TEST_F(OrcReaderInitColumnTest, CheckAcidSchemaTest) {
+    TFileScanRangeParams params;
+    TFileRangeDesc range;
+    auto _reader = OrcReader::create_unique(params, range, "", nullptr, true);
+
+    // Builds a struct with six fields of kinds INT, LONG, INT, LONG, LONG and
+    // (empty) STRUCT, named after `names[0..5]` in that order.
+    auto make_six_field_struct = [](const std::vector<std::string>& names) {
+        auto root = orc::createStructType();
+        root->addStructField(names[0], orc::createPrimitiveType(orc::TypeKind::INT));
+        root->addStructField(names[1], orc::createPrimitiveType(orc::TypeKind::LONG));
+        root->addStructField(names[2], orc::createPrimitiveType(orc::TypeKind::INT));
+        root->addStructField(names[3], orc::createPrimitiveType(orc::TypeKind::LONG));
+        root->addStructField(names[4], orc::createPrimitiveType(orc::TypeKind::LONG));
+        root->addStructField(names[5], orc::createStructType());
+        return root;
+    };
+
+    // 1. Test standard ACID schema
+    {
+        auto acid_type = make_six_field_struct({"operation", "originalTransaction", "bucket",
+                                                "rowId", "currentTransaction", "row"});
+        ASSERT_TRUE(_reader->_check_acid_schema(*acid_type));
+    }
+
+    // 2. Test case-insensitive field names
+    {
+        auto acid_type = make_six_field_struct({"OPERATION", "OriginalTransaction", "Bucket",
+                                                "ROWID", "currentTRANSACTION", "ROW"});
+        ASSERT_TRUE(_reader->_check_acid_schema(*acid_type));
+    }
+
+    // 3. Test non-ACID schema - field count mismatch (only two fields present)
+    {
+        auto non_acid_type = orc::createStructType();
+        non_acid_type->addStructField("operation", orc::createPrimitiveType(orc::TypeKind::INT));
+        non_acid_type->addStructField("originalTransaction",
+                                      orc::createPrimitiveType(orc::TypeKind::LONG));
+        ASSERT_FALSE(_reader->_check_acid_schema(*non_acid_type));
+    }
+
+    // 4. Test non-ACID schema - field name mismatch (second field is misnamed)
+    {
+        auto wrong_name_type = make_six_field_struct(
+                {"operation", "wrongName", "bucket", "rowId", "currentTransaction", "row"});
+        ASSERT_FALSE(_reader->_check_acid_schema(*wrong_name_type));
+    }
+
+    // 5. Test non-struct type
+    {
+        auto int_type = orc::createPrimitiveType(orc::TypeKind::INT);
+        ASSERT_FALSE(_reader->_check_acid_schema(*int_type));
+
+        auto string_type = orc::createPrimitiveType(orc::TypeKind::STRING);
+        ASSERT_FALSE(_reader->_check_acid_schema(*string_type));
+    }
+}
+
+TEST_F(OrcReaderInitColumnTest, RemoveAcidTest) { // exercises _remove_acid() on four schema shapes
+    using namespace orc; // presumably what lets the unqualified create*Type() calls below resolve - TODO confirm
+    TFileScanRangeParams params; // params and range are left default-constructed; no field is set here
+    TFileRangeDesc range;
+    auto _reader = OrcReader::create_unique(params, range, "", nullptr, true); // NOTE(review): meaning of the last three arguments is not visible in this test
+    // 1. ACID-shaped schema: _remove_acid() is expected to hand back the nested row struct
+    {
+        // Build the five ACID metadata fields; the sixth field (row) is appended further down
+        auto acid_type = createStructType();
+        acid_type->addStructField("operation", 
createPrimitiveType(orc::TypeKind::INT));
+        acid_type->addStructField("originalTransaction", 
createPrimitiveType(orc::TypeKind::LONG));
+        acid_type->addStructField("bucket", 
createPrimitiveType(orc::TypeKind::INT));
+        acid_type->addStructField("rowId", 
createPrimitiveType(orc::TypeKind::LONG));
+        acid_type->addStructField("currentTransaction", 
createPrimitiveType(orc::TypeKind::LONG));
+
+        // The user-visible columns (id, name) live in the struct attached under the row field
+        auto row_type = createStructType();
+        row_type->addStructField("id", 
createPrimitiveType(orc::TypeKind::INT));
+        row_type->addStructField("name", 
createPrimitiveType(orc::TypeKind::STRING));
+        acid_type->addStructField("row", std::move(row_type)); // row_type is moved-from and not used again
+
+        // Expect the result to look like the row struct: a STRUCT with exactly the fields id and name
+        const orc::Type& removed_type = _reader->_remove_acid(*acid_type);
+        ASSERT_EQ(removed_type.getKind(), orc::TypeKind::STRUCT);
+        ASSERT_EQ(removed_type.getSubtypeCount(), 2); // id and name fields
+        ASSERT_EQ(removed_type.getFieldName(0), "id");
+        ASSERT_EQ(removed_type.getFieldName(1), "name");
+    }
+
+    // 2. A plain two-field struct is not ACID-shaped and is expected to come back untouched
+    {
+        // Two fields only, neither using an ACID metadata field name
+        auto normal_type = createStructType();
+        normal_type->addStructField("field1", 
createPrimitiveType(orc::TypeKind::INT));
+        normal_type->addStructField("field2", 
createPrimitiveType(orc::TypeKind::STRING));
+
+        const orc::Type& result_type = _reader->_remove_acid(*normal_type);
+        ASSERT_EQ(&result_type, normal_type.get()); // Should return the same 
type
+        ASSERT_EQ(result_type.getSubtypeCount(), 2);
+        ASSERT_EQ(result_type.getFieldName(0), "field1");
+        ASSERT_EQ(result_type.getFieldName(1), "field2");
+    }
+
+    // 3. A primitive (non-struct) type is also expected to be returned as the very same object
+    {
+        auto int_type = createPrimitiveType(orc::TypeKind::INT);
+        const orc::Type& result_type = _reader->_remove_acid(*int_type);
+        ASSERT_EQ(&result_type, int_type.get()); // compares addresses: identity, not structural equality
+        ASSERT_EQ(result_type.getKind(), orc::TypeKind::INT);
+    }
+
+    // 4. ACID-shaped schema whose row struct also contains list and map fields
+    {
+        // Same five ACID metadata fields as in case 1
+        auto acid_type = createStructType();
+        acid_type->addStructField("operation", 
createPrimitiveType(orc::TypeKind::INT));
+        acid_type->addStructField("originalTransaction", 
createPrimitiveType(orc::TypeKind::LONG));
+        acid_type->addStructField("bucket", 
createPrimitiveType(orc::TypeKind::INT));
+        acid_type->addStructField("rowId", 
createPrimitiveType(orc::TypeKind::LONG));
+        acid_type->addStructField("currentTransaction", 
createPrimitiveType(orc::TypeKind::LONG));
+
+        // Row struct with one primitive field, one list field and one map field
+        auto row_type = createStructType();
+
+        // Primitive field: id (INT)
+        row_type->addStructField("id", 
createPrimitiveType(orc::TypeKind::INT));
+
+        // List field: tags (LIST of STRING)
+        auto array_type = 
createListType(createPrimitiveType(orc::TypeKind::STRING));
+        row_type->addStructField("tags", std::move(array_type));
+
+        // Map field: properties (MAP from STRING to INT)
+        auto map_type = 
createMapType(createPrimitiveType(orc::TypeKind::STRING),
+                                      createPrimitiveType(orc::TypeKind::INT));
+        row_type->addStructField("properties", std::move(map_type));
+
+        acid_type->addStructField("row", std::move(row_type));
+
+        // Expect the row struct back, with its three fields in the order they were added
+        const orc::Type& removed_type = _reader->_remove_acid(*acid_type);
+        ASSERT_EQ(removed_type.getKind(), orc::TypeKind::STRUCT);
+        ASSERT_EQ(removed_type.getSubtypeCount(), 3); // id, tags, properties
+        ASSERT_EQ(removed_type.getFieldName(0), "id");
+        ASSERT_EQ(removed_type.getFieldName(1), "tags");
+        ASSERT_EQ(removed_type.getFieldName(2), "properties");
+
+        // Only the top-level kind of each field is checked, not the list/map element types
+        ASSERT_EQ(removed_type.getSubtype(0)->getKind(), orc::TypeKind::INT);
+        ASSERT_EQ(removed_type.getSubtype(1)->getKind(), orc::TypeKind::LIST);
+        ASSERT_EQ(removed_type.getSubtype(2)->getKind(), orc::TypeKind::MAP);
+    }
+}
+
+} // namespace vectorized
+} // namespace doris


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org
For additional commands, e-mail: commits-h...@doris.apache.org

(doris) branch branch-2.1 updated: [feat](test)add some be ut for orc/parquet reader (#49418) (#49948)

Reply via email to