This is an automated email from the ASF dual-hosted git repository. yiguolei pushed a commit to branch branch-2.1 in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/branch-2.1 by this push: new 6e448d3a562 [feat](test)add some be ut for orc/parquet reader (#49418) (#49948) 6e448d3a562 is described below commit 6e448d3a5625423c685449a0ff433e9789c08327 Author: daidai <changyu...@selectdb.com> AuthorDate: Wed Apr 16 12:38:45 2025 +0800 [feat](test)add some be ut for orc/parquet reader (#49418) (#49948) bp #49418 --- be/src/vec/exec/format/orc/vorc_reader.cpp | 8 +- be/test/vec/exec/orc/orc_convert_dict_test.cpp | 237 ++++++++++ .../exec/orc/orc_convert_to_orc_literal_test.cpp | 216 ++++++++++ be/test/vec/exec/orc/orc_memory_stream_test.h | 102 +++++ be/test/vec/exec/orc/orc_reader_fill_data_test.cpp | 475 +++++++++++++++++++++ .../vec/exec/orc/orc_reader_init_column_test.cpp | 359 ++++++++++++++++ 6 files changed, 1391 insertions(+), 6 deletions(-) diff --git a/be/src/vec/exec/format/orc/vorc_reader.cpp b/be/src/vec/exec/format/orc/vorc_reader.cpp index a98e88c4173..83e3d9dfbb8 100644 --- a/be/src/vec/exec/format/orc/vorc_reader.cpp +++ b/be/src/vec/exec/format/orc/vorc_reader.cpp @@ -430,6 +430,8 @@ bool OrcReader::_check_acid_schema(const orc::Type& type) { return false; } } + } else { + return false; } return true; } @@ -1425,15 +1427,9 @@ Status OrcReader::_fill_doris_data_column(const std::string& col_name, case TypeIndex::Decimal128V3: return _decode_decimal_column<Decimal128V3, is_filter>(col_name, data_column, data_type, cvb, num_values); - case TypeIndex::Date: - return _decode_time_column<VecDateTimeValue, Int64, orc::LongVectorBatch, is_filter>( - col_name, data_column, cvb, num_values); case TypeIndex::DateV2: return _decode_time_column<DateV2Value<DateV2ValueType>, UInt32, orc::LongVectorBatch, is_filter>(col_name, data_column, cvb, num_values); - case TypeIndex::DateTime: - return _decode_time_column<VecDateTimeValue, Int64, orc::TimestampVectorBatch, is_filter>( - col_name, data_column, cvb, num_values); case TypeIndex::DateTimeV2: return 
_decode_time_column<DateV2Value<DateTimeV2ValueType>, UInt64, orc::TimestampVectorBatch, is_filter>(col_name, data_column, cvb, diff --git a/be/test/vec/exec/orc/orc_convert_dict_test.cpp b/be/test/vec/exec/orc/orc_convert_dict_test.cpp new file mode 100644 index 00000000000..bce08cc63db --- /dev/null +++ b/be/test/vec/exec/orc/orc_convert_dict_test.cpp @@ -0,0 +1,237 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include <gtest/gtest.h> + +#include <memory> + +#include "orc/ColumnPrinter.hh" +#include "vec/columns/column_array.h" +#include "vec/columns/column_nullable.h" +#include "vec/columns/column_string.h" +#include "vec/columns/column_struct.h" +#include "vec/exec/format/orc/vorc_reader.h" + +namespace doris { +namespace vectorized { +class OrcReaderConvertDictTest : public ::testing::Test { +protected: + void SetUp() override {} + + void TearDown() override {} +}; + +std::unique_ptr<orc::EncodedStringVectorBatch> create_encoded_string_batch( + const std::vector<std::string>& dict_values) { + auto batch = + std::make_unique<orc::EncodedStringVectorBatch>(1024 * 1024, *orc::getDefaultPool()); + batch->dictionary = std::make_unique<orc::StringDictionary>(*orc::getDefaultPool()); + + // Fill dictionary data + int sz = 0; + for (const auto& value : dict_values) { + sz += value.length(); + } + + batch->dictionary->dictionaryBlob.resize(sz + 1024); + batch->dictionary->dictionaryOffset.resize(dict_values.size() + 10); + int x = 0; + for (const auto& value : dict_values) { + batch->dictionary->dictionaryOffset[x + 1] = + batch->dictionary->dictionaryOffset[x] + value.size(); + int y = batch->dictionary->dictionaryOffset[x]; + for (auto ch : value) { + batch->dictionary->dictionaryBlob[y] = ch; + y++; + } + x++; + } + + return batch; +} + +TEST_F(OrcReaderConvertDictTest, ConvertDictColumnToStringColumnBasic) { + // Prepare dictionary data + std::vector<std::string> dict_values = {"hello", "world", "doris", "test"}; + auto string_batch = create_encoded_string_batch(dict_values); + + // Prepare dictionary index column + auto dict_column = ColumnInt32::create(); + std::vector<int32_t> indices = {0, 1, 2, + 3, 1, 0}; // "hello", "world", "doris", "test", "world", "hello" + for (auto x : indices) { + dict_column->insert(x); + } + + // Create ORC type + auto orc_type_ptr = createPrimitiveType(orc::TypeKind::STRING); + + TFileScanRangeParams params; + TFileRangeDesc range; + 
auto reader = OrcReader::create_unique(params, range, "", nullptr, true); + + // Execute conversion + auto result_column = reader->_convert_dict_column_to_string_column( + dict_column.get(), nullptr, string_batch.get(), orc_type_ptr.get()); + + // Validate results + auto* string_column = assert_cast<const ColumnString*>(result_column.get()); + ASSERT_EQ(string_column->size(), 6); + ASSERT_EQ(string_column->get_data_at(0).to_string(), "hello"); + ASSERT_EQ(string_column->get_data_at(1).to_string(), "world"); + ASSERT_EQ(string_column->get_data_at(2).to_string(), "doris"); + ASSERT_EQ(string_column->get_data_at(3).to_string(), "test"); + ASSERT_EQ(string_column->get_data_at(4).to_string(), "world"); + ASSERT_EQ(string_column->get_data_at(5).to_string(), "hello"); +} + +TEST_F(OrcReaderConvertDictTest, ConvertDictColumnToStringColumnWithNulls) { + // Prepare dictionary data + std::vector<std::string> dict_values = {"hello", "world", "doris"}; + auto string_batch = create_encoded_string_batch(dict_values); + + // Prepare dictionary index column + auto dict_column = ColumnInt32::create(); + std::vector<int32_t> indices = {0, 1, 2, 1, 0}; + for (auto x : indices) { + dict_column->insert(x); + } + + // Prepare null map + NullMap null_map = {0, 1, 0, 0, 1}; // 2nd and 5th elements are null + + // Create ORC type + auto orc_type_ptr = createPrimitiveType(orc::TypeKind::STRING); + + TFileScanRangeParams params; + TFileRangeDesc range; + auto _reader = OrcReader::create_unique(params, range, "", nullptr, true); + + // Execute conversion + auto result_column = _reader->_convert_dict_column_to_string_column( + dict_column.get(), &null_map, string_batch.get(), orc_type_ptr.get()); + + // Validate results + auto* string_column = assert_cast<const ColumnString*>(result_column.get()); + ASSERT_EQ(string_column->size(), 5); + ASSERT_EQ(string_column->get_data_at(0).to_string(), "hello"); + ASSERT_EQ(string_column->get_data_at(1).to_string(), ""); // null value + 
ASSERT_EQ(string_column->get_data_at(2).to_string(), "doris"); + ASSERT_EQ(string_column->get_data_at(3).to_string(), "world"); + ASSERT_EQ(string_column->get_data_at(4).to_string(), ""); // null value +} + +TEST_F(OrcReaderConvertDictTest, ConvertDictColumnToStringColumnChar) { + // Prepare dictionary data (CHAR type with right-padded spaces) + std::vector<std::string> dict_values = {"hello ", "world ", "test "}; + auto string_batch = create_encoded_string_batch(dict_values); + + // Prepare dictionary index column + auto dict_column = ColumnInt32::create(); + std::vector<int32_t> indices = {0, 1, 2, 1}; + for (auto x : indices) { + dict_column->insert(x); + } + + // Create ORC CHAR type + auto orc_type_ptr = createPrimitiveType(orc::TypeKind::CHAR); + TFileScanRangeParams params; + TFileRangeDesc range; + auto _reader = OrcReader::create_unique(params, range, "", nullptr, true); + + // Execute conversion + auto result_column = _reader->_convert_dict_column_to_string_column( + dict_column.get(), nullptr, string_batch.get(), orc_type_ptr.get()); + + // Validate results (should remove trailing spaces) + auto* string_column = assert_cast<const ColumnString*>(result_column.get()); + ASSERT_EQ(string_column->size(), 4); + ASSERT_EQ(string_column->get_data_at(0).to_string(), "hello"); // spaces removed + ASSERT_EQ(string_column->get_data_at(1).to_string(), "world"); // spaces removed + ASSERT_EQ(string_column->get_data_at(2).to_string(), "test"); // spaces removed + ASSERT_EQ(string_column->get_data_at(3).to_string(), "world"); // spaces removed +} + +TEST_F(OrcReaderConvertDictTest, ConvertDictColumnToStringColumnEmpty) { + // Prepare empty dictionary data + std::vector<std::string> dict_values = {""}; + auto string_batch = create_encoded_string_batch(dict_values); + + // Prepare dictionary index column + auto dict_column = ColumnInt32::create(); + std::vector<int32_t> indices = {0, 0, 0}; + for (auto x : indices) { + dict_column->insert(x); + } + + // Create ORC type + 
auto orc_type_ptr = createPrimitiveType(orc::TypeKind::STRING); + TFileScanRangeParams params; + TFileRangeDesc range; + auto _reader = OrcReader::create_unique(params, range, "", nullptr, true); + // Execute conversion + auto result_column = _reader->_convert_dict_column_to_string_column( + dict_column.get(), nullptr, string_batch.get(), orc_type_ptr.get()); + + // Validate results + auto* string_column = assert_cast<const ColumnString*>(result_column.get()); + ASSERT_EQ(string_column->size(), 3); + ASSERT_EQ(string_column->get_data_at(0).to_string(), ""); + ASSERT_EQ(string_column->get_data_at(1).to_string(), ""); + ASSERT_EQ(string_column->get_data_at(2).to_string(), ""); +} + +TEST_F(OrcReaderConvertDictTest, ConvertDictColumnToStringColumnMixed) { + // Prepare mixed length dictionary data + std::vector<std::string> dict_values = {"", "a", "ab", "abc", "abcd"}; + auto string_batch = create_encoded_string_batch(dict_values); + + // Prepare dictionary index column + auto dict_column = ColumnInt32::create(); + std::vector<int32_t> indices = {0, 1, 2, 3, 4, 2, 1, 0}; + for (auto x : indices) { + dict_column->insert(x); + } + + // Prepare partial null values + NullMap null_map = {0, 0, 1, 0, 0, 1, 0, 0}; + + // Create ORC type + auto orc_type_ptr = createPrimitiveType(orc::TypeKind::STRING); + TFileScanRangeParams params; + TFileRangeDesc range; + auto _reader = OrcReader::create_unique(params, range, "", nullptr, true); + // Execute conversion + auto result_column = _reader->_convert_dict_column_to_string_column( + dict_column.get(), &null_map, string_batch.get(), orc_type_ptr.get()); + + // Validate results + auto* string_column = assert_cast<const ColumnString*>(result_column.get()); + ASSERT_EQ(string_column->size(), 8); + ASSERT_EQ(string_column->get_data_at(0).to_string(), ""); + ASSERT_EQ(string_column->get_data_at(1).to_string(), "a"); + ASSERT_EQ(string_column->get_data_at(2).to_string(), ""); // null + ASSERT_EQ(string_column->get_data_at(3).to_string(), 
"abc"); + ASSERT_EQ(string_column->get_data_at(4).to_string(), "abcd"); + ASSERT_EQ(string_column->get_data_at(5).to_string(), ""); // null + ASSERT_EQ(string_column->get_data_at(6).to_string(), "a"); + ASSERT_EQ(string_column->get_data_at(7).to_string(), ""); +} + +} // namespace vectorized + +} // namespace doris diff --git a/be/test/vec/exec/orc/orc_convert_to_orc_literal_test.cpp b/be/test/vec/exec/orc/orc_convert_to_orc_literal_test.cpp new file mode 100644 index 00000000000..ac79f22a6bb --- /dev/null +++ b/be/test/vec/exec/orc/orc_convert_to_orc_literal_test.cpp @@ -0,0 +1,216 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include <gtest/gtest.h> + +#include <memory> + +#include "orc/ColumnPrinter.hh" +#include "vec/columns/column_struct.h" +#include "vec/core/types.h" +#include "vec/data_types/data_type_nullable.h" +#include "vec/exec/format/orc/vorc_reader.cpp" + +namespace doris { +namespace vectorized { +class OrcReaderConvertToOrcLiteralTest : public ::testing::Test { +protected: + void SetUp() override {} + + void TearDown() override {} +}; + +TEST_F(OrcReaderConvertToOrcLiteralTest, ConvertTypesTest) { + // TINYINT test + { + int8_t tiny_value = 127; + StringRef literal_data(reinterpret_cast<char*>(&tiny_value), sizeof(tiny_value)); + auto orc_type_ptr = createPrimitiveType(orc::TypeKind::BYTE); + auto [success, literal] = + convert_to_orc_literal<TYPE_TINYINT>(orc_type_ptr.get(), literal_data.data, 0, 0); + ASSERT_TRUE(success); + ASSERT_EQ(literal.getLong(), 127); + } + + // SMALLINT test + { + int16_t small_value = 32000; + StringRef literal_data(reinterpret_cast<char*>(&small_value), sizeof(small_value)); + auto orc_type_ptr = createPrimitiveType(orc::TypeKind::SHORT); + auto [success, literal] = + convert_to_orc_literal<TYPE_SMALLINT>(orc_type_ptr.get(), literal_data.data, 0, 0); + ASSERT_TRUE(success); + ASSERT_EQ(literal.getLong(), 32000); + } + + // INT test + { + int32_t int_value = 2147483647; + StringRef literal_data(reinterpret_cast<char*>(&int_value), sizeof(int_value)); + auto orc_type_ptr = createPrimitiveType(orc::TypeKind::INT); + auto [success, literal] = + convert_to_orc_literal<TYPE_INT>(orc_type_ptr.get(), literal_data.data, 0, 0); + ASSERT_TRUE(success); + ASSERT_EQ(literal.getLong(), 2147483647); + } + + // BIGINT test + { + int64_t big_value = 9223372036854775807LL; + StringRef literal_data(reinterpret_cast<char*>(&big_value), sizeof(big_value)); + auto orc_type_ptr = createPrimitiveType(orc::TypeKind::LONG); + auto [success, literal] = + convert_to_orc_literal<TYPE_BIGINT>(orc_type_ptr.get(), literal_data.data, 0, 0); + ASSERT_TRUE(success); + 
ASSERT_EQ(literal.getLong(), 9223372036854775807LL); + } + // FLOAT test + { + float float_value = 3.14159f; + StringRef literal_data(reinterpret_cast<char*>(&float_value), sizeof(float_value)); + auto orc_type_ptr = createPrimitiveType(orc::TypeKind::FLOAT); + auto [success, literal] = + convert_to_orc_literal<TYPE_FLOAT>(orc_type_ptr.get(), literal_data.data, 0, 0); + ASSERT_TRUE(success); + ASSERT_NEAR(literal.getFloat(), 3.14159f, 0.0001); + } + + // DOUBLE test + { + double double_value = 3.14159265358979323846; + StringRef literal_data(reinterpret_cast<char*>(&double_value), sizeof(double_value)); + auto orc_type_ptr = createPrimitiveType(orc::TypeKind::DOUBLE); + auto [success, literal] = + convert_to_orc_literal<TYPE_DOUBLE>(orc_type_ptr.get(), literal_data.data, 0, 0); + ASSERT_TRUE(success); + ASSERT_DOUBLE_EQ(literal.getFloat(), 3.14159265358979323846); + } + // STRING test + { + std::string str_value = "Hello, World!"; + StringRef literal_data(str_value.data(), str_value.size()); + auto orc_type_ptr = createPrimitiveType(orc::TypeKind::STRING); + auto [success, literal] = + convert_to_orc_literal<TYPE_STRING>(orc_type_ptr.get(), (void*)&literal_data, 0, 0); + ASSERT_TRUE(success); + ASSERT_EQ(std::string(literal.getString().data(), literal.getString().length()), + "Hello, World!"); + } + + // DECIMAL32 test + { + int32_t decimal32_value = 12345; + StringRef literal_data(reinterpret_cast<const char*>(&decimal32_value), + sizeof(decimal32_value)); + auto orc_type_ptr = createPrimitiveType(orc::TypeKind::DECIMAL); + auto [success, literal] = + convert_to_orc_literal<TYPE_DECIMAL32>(orc_type_ptr.get(), literal_data.data, 9, 4); + ASSERT_TRUE(success); + ASSERT_EQ(literal.getDecimal().toString(), "1.2345"); + } + + // DECIMAL64 test + { + int64_t decimal64_value = 123456789012345LL; + StringRef literal_data(reinterpret_cast<const char*>(&decimal64_value), + sizeof(decimal64_value)); + auto orc_type_ptr = createPrimitiveType(orc::TypeKind::DECIMAL); + auto 
[success, literal] = convert_to_orc_literal<TYPE_DECIMAL64>(orc_type_ptr.get(), + literal_data.data, 18, 6); + ASSERT_TRUE(success); + ASSERT_EQ(literal.getDecimal().toString(), "123456789.012345"); + } + + // DECIMAL128 test + { + int128_t decimal128_value = 1234512345; + StringRef literal_data(reinterpret_cast<const char*>(&decimal128_value), + sizeof(decimal128_value)); + auto orc_type_ptr = createPrimitiveType(orc::TypeKind::DECIMAL); + auto [success, literal] = convert_to_orc_literal<TYPE_DECIMAL128I>( + orc_type_ptr.get(), literal_data.data, 38, 9); + ASSERT_TRUE(success); + ASSERT_EQ(literal.getDecimal().toString(), "1.234512345"); + } + + { + // Normal date + VecDateTimeValue date_value; + date_value.from_date_str("2024-03-14", 10); + StringRef literal_data(reinterpret_cast<const char*>(&date_value), sizeof(date_value)); + auto orc_type_ptr = createPrimitiveType(orc::TypeKind::DATE); + auto [success, literal] = + convert_to_orc_literal<TYPE_DATE>(orc_type_ptr.get(), literal_data.data, 0, 0); + ASSERT_TRUE(success); + + // Verify converted day offset + int64_t expected_days = 19796; // Day count for 2024-03-14 + ASSERT_EQ(literal.getDate(), expected_days); + + // Boundary date - minimum value + date_value.from_date_str("0001-01-01", 10); + literal_data = StringRef(reinterpret_cast<const char*>(&date_value), sizeof(date_value)); + std::tie(success, literal) = + convert_to_orc_literal<TYPE_DATE>(orc_type_ptr.get(), literal_data.data, 0, 0); + ASSERT_TRUE(success); //-719162 + ASSERT_EQ(literal.getDate(), -719162); + + // Boundary date - maximum value + date_value.from_date_str("9999-12-31", 10); + literal_data = StringRef(reinterpret_cast<const char*>(&date_value), sizeof(date_value)); + std::tie(success, literal) = + convert_to_orc_literal<TYPE_DATE>(orc_type_ptr.get(), literal_data.data, 0, 0); + ASSERT_TRUE(success); // + ASSERT_EQ(literal.getDate(), 2932896); + } + + // DATETIME type test + { + // Normal timestamp + VecDateTimeValue datetime_value; + 
datetime_value.from_date_str("2024-03-14 15:30:45", 19); + StringRef literal_data(reinterpret_cast<const char*>(&datetime_value), + sizeof(datetime_value)); + auto orc_type_ptr = createPrimitiveType(orc::TypeKind::TIMESTAMP); + auto [success, literal] = + convert_to_orc_literal<TYPE_DATETIME>(orc_type_ptr.get(), literal_data.data, 0, 0); + ASSERT_TRUE(success); + + // Verify the converted timestamp in milliseconds (getMillis) + ASSERT_EQ(literal.getTimestamp().getMillis(), 1710430245000); // + + // Midnight time + datetime_value.from_date_str("2024-03-14 00:00:00", 19); + literal_data = + StringRef(reinterpret_cast<const char*>(&datetime_value), sizeof(datetime_value)); + std::tie(success, literal) = + convert_to_orc_literal<TYPE_DATETIME>(orc_type_ptr.get(), literal_data.data, 0, 0); + ASSERT_TRUE(success); + ASSERT_EQ(literal.getTimestamp().getMillis(), 1710374400000); // + + // Leap year handling + datetime_value.from_date_str("2024-02-29 12:00:00", 19); + literal_data = + StringRef(reinterpret_cast<const char*>(&datetime_value), sizeof(datetime_value)); + std::tie(success, literal) = + convert_to_orc_literal<TYPE_DATETIME>(orc_type_ptr.get(), literal_data.data, 0, 0); + ASSERT_TRUE(success); + ASSERT_EQ(literal.getTimestamp().getMillis(), 1709208000000); // + } +} +} // namespace vectorized +} // namespace doris diff --git a/be/test/vec/exec/orc/orc_memory_stream_test.h b/be/test/vec/exec/orc/orc_memory_stream_test.h new file mode 100644 index 00000000000..52c9daad591 --- /dev/null +++ b/be/test/vec/exec/orc/orc_memory_stream_test.h @@ -0,0 +1,102 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include <gtest/gtest.h> + +#include <memory> + +#include "orc/ColumnPrinter.hh" +#include "vec/columns/column_array.h" +#include "vec/columns/column_map.h" +#include "vec/columns/column_nullable.h" +#include "vec/columns/column_string.h" +#include "vec/columns/column_struct.h" +#include "vec/columns/columns_number.h" +#include "vec/core/types.h" +#include "vec/data_types/data_type_array.h" +#include "vec/data_types/data_type_date.h" +#include "vec/data_types/data_type_date_time.h" +#include "vec/data_types/data_type_decimal.h" +#include "vec/data_types/data_type_map.h" +#include "vec/data_types/data_type_nullable.h" +#include "vec/data_types/data_type_number.h" +#include "vec/data_types/data_type_string.h" +#include "vec/data_types/data_type_struct.h" +#include "vec/exec/format/orc/vorc_reader.h" + +namespace doris { +namespace vectorized { + +class MemoryOutputStream : public orc::OutputStream { +public: + MemoryOutputStream(size_t capacity) : name("MemoryOutputStream") { + data = new char[capacity]; + length = 0; + naturalWriteSize = 2048; + } + + virtual ~MemoryOutputStream() override { delete[] data; }; + + virtual uint64_t getLength() const override { return length; } + + virtual uint64_t getNaturalWriteSize() const override { return naturalWriteSize; } + + virtual void write(const void* buf, size_t size) override { + memcpy(data + length, buf, size); + length += size; + } + + virtual const std::string& getName() const override { return name; } + + const char* getData() const { return data; } + + void close() override {} + +private: + 
char* data; + std::string name; + uint64_t length, naturalWriteSize; +}; + +class MemoryInputStream : public orc::InputStream { +public: + MemoryInputStream(const char* _buffer, size_t _size) + : buffer(_buffer), size(_size), naturalReadSize(1024), name("MemoryInputStream") {} + + ~MemoryInputStream() override {} + + virtual uint64_t getLength() const override { return size; } + + virtual uint64_t getNaturalReadSize() const override { return naturalReadSize; } + + virtual void read(void* buf, uint64_t length, uint64_t offset) override { + memcpy(buf, buffer + offset, length); + } + + virtual const std::string& getName() const override { return name; } + + // const char* getData() const { + // return buffer; + // } + +private: + const char* buffer; + uint64_t size, naturalReadSize; + std::string name; +}; +} // namespace vectorized +} // namespace doris \ No newline at end of file diff --git a/be/test/vec/exec/orc/orc_reader_fill_data_test.cpp b/be/test/vec/exec/orc/orc_reader_fill_data_test.cpp new file mode 100644 index 00000000000..d896419a338 --- /dev/null +++ b/be/test/vec/exec/orc/orc_reader_fill_data_test.cpp @@ -0,0 +1,475 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include <gtest/gtest.h> + +#include <memory> + +#include "orc/ColumnPrinter.hh" +#include "orc_memory_stream_test.h" +#include "vec/columns/column_array.h" +#include "vec/columns/column_struct.h" +#include "vec/core/types.h" +#include "vec/data_types/data_type_array.h" +#include "vec/data_types/data_type_decimal.h" +#include "vec/data_types/data_type_map.h" +#include "vec/data_types/data_type_number.h" +#include "vec/data_types/data_type_struct.h" +#include "vec/exec/format/orc/vorc_reader.h" + +namespace doris { +namespace vectorized { +class OrcReaderFillDataTest : public ::testing::Test { +protected: + void SetUp() override {} + + void TearDown() override {} +}; + +std::unique_ptr<orc::LongVectorBatch> create_long_batch(size_t size, + const std::vector<int64_t>& values, + const std::vector<bool>& nulls = {}) { + auto batch = std::make_unique<orc::LongVectorBatch>(size, *orc::getDefaultPool()); + batch->resize(size); + batch->notNull.resize(size); + + bool has_nulls = nulls.size() == size; + for (size_t i = 0; i < size; ++i) { + if (has_nulls) { + batch->notNull[i] = !nulls[i]; + } else { + batch->notNull[i] = true; + } + + if (batch->notNull[i]) { + batch->data[i] = values[i]; + } + } + + if (has_nulls) { + batch->hasNulls = true; + } else { + batch->hasNulls = false; + } + return batch; +} + +TEST_F(OrcReaderFillDataTest, TestFillLongColumn) { + std::vector<int64_t> values = {1, 2, 3, 4, 5}; + auto batch = create_long_batch(values.size(), values); + auto column = ColumnInt64::create(); + auto data_type = std::make_shared<DataTypeInt64>(); + + auto orc_type_ptr = createPrimitiveType(orc::TypeKind::LONG); + + TFileScanRangeParams params; + TFileRangeDesc range; + auto reader = OrcReader::create_unique(params, range, "", nullptr, true); + + MutableColumnPtr xx = column->assume_mutable(); + + Status status = reader->_fill_doris_data_column<false>( + "test_long", xx, data_type, orc_type_ptr.get(), batch.get(), values.size()); + + ASSERT_TRUE(status.ok()); + 
ASSERT_EQ(column->size(), values.size()); + + for (size_t i = 0; i < values.size(); ++i) { + ASSERT_EQ(column->get_int(i), values[i]); + } +} + +TEST_F(OrcReaderFillDataTest, TestFillLongColumnWithNull) { + std::vector<int64_t> values = {1, 2, 3, 4, 5}; + std::vector<bool> nulls = {false, true, false, true, false}; + auto batch = create_long_batch(values.size(), values, nulls); + auto column = ColumnInt64::create(); + auto data_type = std::make_shared<DataTypeInt64>(); + + auto orc_type_ptr = createPrimitiveType(orc::TypeKind::LONG); + + TFileScanRangeParams params; + TFileRangeDesc range; + auto reader = OrcReader::create_unique(params, range, "", nullptr, true); + + MutableColumnPtr xx = column->assume_mutable(); + + Status status = reader->_fill_doris_data_column<false>( + "test_long_with_null", xx, data_type, orc_type_ptr.get(), batch.get(), values.size()); + + ASSERT_TRUE(status.ok()); + ASSERT_EQ(column->size(), values.size()); + + for (size_t i = 0; i < values.size(); ++i) { + if (!nulls[i]) { + ASSERT_EQ(column->get_int(i), values[i]); + } + } +} + +TEST_F(OrcReaderFillDataTest, ComplexTypeConversionTest) { + // Array type test + { + using namespace orc; + std::unique_ptr<orc::Type> type(orc::Type::buildTypeFromString("struct<col1:array<int>>")); + + WriterOptions options; + options.setMemoryPool(orc::getDefaultPool()); + + MemoryOutputStream memStream(100 * 1024 * 1024); + std::unique_ptr<orc::Writer> writer = orc::createWriter(*type, &memStream, options); + + std::unique_ptr<orc::ColumnVectorBatch> batch = writer->createRowBatch(1024); + orc::StructVectorBatch* structBatch = dynamic_cast<orc::StructVectorBatch*>(batch.get()); + orc::ListVectorBatch* listBatch = + dynamic_cast<orc::ListVectorBatch*>(structBatch->fields[0]); + orc::LongVectorBatch* intBatch = + dynamic_cast<orc::LongVectorBatch*>(listBatch->elements.get()); + int64_t* data = intBatch->data.data(); + int64_t* offsets = listBatch->offsets.data(); + uint64_t rowCount = 20; + uint64_t offset = 0; + 
uint64_t maxListLength = 5; + for (uint64_t i = 0; i < rowCount; ++i) { + offsets[i] = static_cast<int64_t>(offset); + for (uint64_t length = i % maxListLength + 1; length != 0; --length) { + data[offset++] = static_cast<int64_t>(i); + } + } + offsets[rowCount] = static_cast<int64_t>(offset); + + structBatch->numElements = rowCount; + listBatch->numElements = rowCount; + + TFileScanRangeParams params; + TFileRangeDesc range; + auto reader = OrcReader::create_unique(params, range, "", nullptr, true); + + auto doris_struct_type = std::make_shared<DataTypeStruct>( + std::vector<DataTypePtr> { + std::make_shared<DataTypeArray>(std::make_shared<DataTypeInt32>())}, + std::vector<std::string> {"col1"}); + MutableColumnPtr doris_column = doris_struct_type->create_column()->assume_mutable(); + + Status status = reader->_fill_doris_data_column<false>( + "test", doris_column, doris_struct_type, type.get(), structBatch, rowCount); + + ASSERT_TRUE(status.ok()); + std::string line; + std::unique_ptr<orc::ColumnPrinter> printer = orc::createColumnPrinter(line, type.get()); + printer->reset(*structBatch); + + for (int i = 0; i < rowCount; i++) { + line.clear(); + printer->printRow(i); + std::cout << "line = " << line << "\n"; + } + Block block {std::vector<ColumnWithTypeAndName> { + {doris_column->get_ptr(), doris_struct_type, "cc"}}}; + std::cout << block.dump_data() << "\n"; + + ASSERT_EQ(block.dump_data(), + "+-----------------------------+\n" + "|cc(Struct(col1:Array(Int32)))|\n" + "+-----------------------------+\n" + "| {[0]}|\n" + "| {[1, 1]}|\n" + "| {[2, 2, 2]}|\n" + "| {[3, 3, 3, 3]}|\n" + "| {[4, 4, 4, 4, 4]}|\n" + "| {[5]}|\n" + "| {[6, 6]}|\n" + "| {[7, 7, 7]}|\n" + "| {[8, 8, 8, 8]}|\n" + "| {[9, 9, 9, 9, 9]}|\n" + "| {[10]}|\n" + "| {[11, 11]}|\n" + "| {[12, 12, 12]}|\n" + "| {[13, 13, 13, 13]}|\n" + "| {[14, 14, 14, 14, 14]}|\n" + "| {[15]}|\n" + "| {[16, 16]}|\n" + "| {[17, 17, 17]}|\n" + "| {[18, 18, 18, 18]}|\n" + "| {[19, 19, 19, 19, 19]}|\n" + 
"+-----------------------------+\n"); + } + + { + using namespace orc; + auto type = std::unique_ptr<Type>(Type::buildTypeFromString("struct<col1:int,col2:int>")); + + size_t rowCount = 10; + MemoryOutputStream memStream(100 * 1024 * 1024); + WriterOptions options; + options.setMemoryPool(getDefaultPool()); + auto writer = createWriter(*type, &memStream, options); + auto batch = writer->createRowBatch(rowCount); + auto& structBatch = dynamic_cast<StructVectorBatch&>(*batch); + auto& longBatch1 = dynamic_cast<LongVectorBatch&>(*structBatch.fields[0]); + auto& longBatch2 = dynamic_cast<LongVectorBatch&>(*structBatch.fields[1]); + structBatch.numElements = rowCount; + longBatch1.numElements = rowCount; + longBatch2.numElements = rowCount; + for (size_t i = 0; i < rowCount; ++i) { + longBatch1.data[i] = static_cast<int64_t>(i * 100); + longBatch2.data[i] = static_cast<int64_t>(i * 300); + } + + std::string line; + std::unique_ptr<orc::ColumnPrinter> printer = orc::createColumnPrinter(line, type.get()); + printer->reset(structBatch); + + for (int i = 0; i < rowCount; i++) { + line.clear(); + printer->printRow(i); + std::cout << "line = " << line << "\n"; + } + + TFileScanRangeParams params; + TFileRangeDesc range; + auto reader = OrcReader::create_unique(params, range, "", nullptr, true); + + auto doris_struct_type = std::make_shared<DataTypeStruct>( + std::vector<DataTypePtr> {std::make_shared<DataTypeInt32>(), + std::make_shared<DataTypeInt32>()}, + std::vector<std::string> {"col1", "col2"}); + MutableColumnPtr doris_column = doris_struct_type->create_column()->assume_mutable(); + + Status status = reader->_fill_doris_data_column<false>( + "test", doris_column, doris_struct_type, type.get(), &structBatch, rowCount); + + ASSERT_TRUE(status.ok()); + + Block block {std::vector<ColumnWithTypeAndName> { + {doris_column->get_ptr(), doris_struct_type, "cc"}}}; + std::cout << block.dump_data() << "\n"; + + ASSERT_EQ(block.dump_data(), + 
"+----------------------------------+\n" + "|cc(Struct(col1:Int32, col2:Int32))|\n" + "+----------------------------------+\n" + "| {0, 0}|\n" + "| {100, 300}|\n" + "| {200, 600}|\n" + "| {300, 900}|\n" + "| {400, 1200}|\n" + "| {500, 1500}|\n" + "| {600, 1800}|\n" + "| {700, 2100}|\n" + "| {800, 2400}|\n" + "| {900, 2700}|\n" + "+----------------------------------+\n"); + } + + { + using namespace orc; + + const uint64_t maxPrecision = 18; + MemoryOutputStream memStream(100 * 1024 * 102); + MemoryPool* pool = getDefaultPool(); + std::unique_ptr<Type> type(Type::buildTypeFromString("struct<col1:decimal(18,5)>")); + WriterOptions options; + options.setMemoryPool(pool); + + uint64_t rowCount = 5; + std::unique_ptr<Writer> writer = createWriter(*type, &memStream, options); + std::unique_ptr<ColumnVectorBatch> batch = + writer->createRowBatch(2 * rowCount + 2 * maxPrecision); + StructVectorBatch* structBatch = dynamic_cast<StructVectorBatch*>(batch.get()); + Decimal64VectorBatch* decBatch = + dynamic_cast<Decimal64VectorBatch*>(structBatch->fields[0]); + decBatch->scale = 5; + decBatch->precision = 18; + // write positive decimals + for (uint64_t i = 0; i < rowCount; ++i) { + decBatch->values[i] = static_cast<int64_t>(i + 10000); + } + + // write negative decimals + for (uint64_t i = rowCount; i < 2 * rowCount; ++i) { + decBatch->values[i] = static_cast<int64_t>(i - 10000); + } + + // write all precision decimals + int64_t dec = 0; + for (uint64_t i = 2 * rowCount; i < 2 * rowCount + 2 * maxPrecision; i += 2) { + dec = dec * 10 + 9; + decBatch->values[i] = dec; + decBatch->values[i + 1] = -dec; + } + rowCount = 2 * (rowCount + maxPrecision); + structBatch->numElements = decBatch->numElements = rowCount; + + std::string line; + std::unique_ptr<orc::ColumnPrinter> printer = orc::createColumnPrinter(line, type.get()); + printer->reset(*structBatch); + + for (int i = 0; i < rowCount; i++) { + line.clear(); + printer->printRow(i); + std::cout << "line = " << line << "\n"; + 
} + + TFileScanRangeParams params; + TFileRangeDesc range; + auto reader = OrcReader::create_unique(params, range, "", nullptr, true); + + auto doris_struct_type = std::make_shared<DataTypeStruct>( + std::vector<DataTypePtr> {std::make_shared<DataTypeDecimal<Decimal64>>(18, 5)}, + std::vector<std::string> {"col1"}); + MutableColumnPtr doris_column = doris_struct_type->create_column()->assume_mutable(); + reader->_decimal_scale_params.resize(0); + reader->_decimal_scale_params_index = 0; + Status status = reader->_fill_doris_data_column<false>( + "test", doris_column, doris_struct_type, type.get(), structBatch, rowCount); + + ASSERT_TRUE(status.ok()); + + Block block {std::vector<ColumnWithTypeAndName> { + {doris_column->get_ptr(), doris_struct_type, "cc"}}}; + std::cout << block.dump_data() << "\n"; + ASSERT_EQ(block.dump_data(), + "+-------------------------------+\n" + "|cc(Struct(col1:Decimal(18, 5)))|\n" + "+-------------------------------+\n" + "| {0.10000}|\n" + "| {0.10001}|\n" + "| {0.10002}|\n" + "| {0.10003}|\n" + "| {0.10004}|\n" + "| {-0.09995}|\n" + "| {-0.09994}|\n" + "| {-0.09993}|\n" + "| {-0.09992}|\n" + "| {-0.09991}|\n" + "| {0.00009}|\n" + "| {-0.00009}|\n" + "| {0.00099}|\n" + "| {-0.00099}|\n" + "| {0.00999}|\n" + "| {-0.00999}|\n" + "| {0.09999}|\n" + "| {-0.09999}|\n" + "| {0.99999}|\n" + "| {-0.99999}|\n" + "| {9.99999}|\n" + "| {-9.99999}|\n" + "| {99.99999}|\n" + "| {-99.99999}|\n" + "| {999.99999}|\n" + "| {-999.99999}|\n" + "| {9999.99999}|\n" + "| {-9999.99999}|\n" + "| {99999.99999}|\n" + "| {-99999.99999}|\n" + "| {999999.99999}|\n" + "| {-999999.99999}|\n" + "| {9999999.99999}|\n" + "| {-9999999.99999}|\n" + "| {99999999.99999}|\n" + "| {-99999999.99999}|\n" + "| {999999999.99999}|\n" + "| {-999999999.99999}|\n" + "| {9999999999.99999}|\n" + "| {-9999999999.99999}|\n" + "| {99999999999.99999}|\n" + "| {-99999999999.99999}|\n" + "| {999999999999.99999}|\n" + "| {-999999999999.99999}|\n" + "| {9999999999999.99999}|\n" + "| 
{-9999999999999.99999}|\n" + "+-------------------------------+\n"); + } + + { + using namespace orc; + size_t rowCount = 10; + MemoryOutputStream memStream(100 * 1024 * 1024); + MemoryPool* pool = getDefaultPool(); + auto type = std::unique_ptr<Type>(Type::buildTypeFromString("map<int,float>")); + WriterOptions options; + options.setMemoryPool(pool); + auto writer = createWriter(*type, &memStream, options); + auto batch = writer->createRowBatch(rowCount * 10); + auto& mapBatch = dynamic_cast<MapVectorBatch&>(*batch); + int64_t* offsets = mapBatch.offsets.data(); + auto& keyBatch = dynamic_cast<LongVectorBatch&>(*(mapBatch.keys)); + auto& valueBatch = dynamic_cast<DoubleVectorBatch&>(*(mapBatch.elements)); + + mapBatch.numElements = rowCount; + uint64_t Offset = 0; + + for (size_t i = 0; i < rowCount; ++i) { + offsets[i] = static_cast<int64_t>(Offset); + for (int j = 0; j < i / 2; j++) { + keyBatch.data[Offset] = i * 100; + valueBatch.data[Offset] = i * 3.; + Offset++; + } + } + offsets[rowCount] = static_cast<int64_t>(Offset); + + keyBatch.numElements = Offset; + valueBatch.numElements = Offset; + + std::string line; + std::unique_ptr<orc::ColumnPrinter> printer = orc::createColumnPrinter(line, type.get()); + printer->reset(mapBatch); + + for (int i = 0; i < rowCount; i++) { + line.clear(); + + printer->printRow(i); + std::cout << "line = " << line << "\n"; + } + + TFileScanRangeParams params; + TFileRangeDesc range; + auto reader = OrcReader::create_unique(params, range, "", nullptr, true); + + auto doris_struct_type = std::make_shared<DataTypeMap>(std::make_shared<DataTypeInt32>(), + std::make_shared<DataTypeFloat32>()); + MutableColumnPtr doris_column = doris_struct_type->create_column()->assume_mutable(); + + Status status = reader->_fill_doris_data_column<false>( + "test", doris_column, doris_struct_type, type.get(), &mapBatch, rowCount); + + ASSERT_TRUE(status.ok()); + + Block block {std::vector<ColumnWithTypeAndName> { + {doris_column->get_ptr(), 
doris_struct_type, "cc"}}}; + std::cout << block.dump_data() << "\n"; + ASSERT_EQ(block.dump_data(), + "+-----------------------+\n" + "|cc(Map(Int32, Float32))|\n" + "+-----------------------+\n" + "| {}|\n" + "| {}|\n" + "| {200:6}|\n" + "| {300:9}|\n" + "| {400:12, 400:12}|\n" + "| {500:15, 500:15}|\n" + "|{600:18, 600:18, 600...|\n" + "|{700:21, 700:21, 700...|\n" + "|{800:24, 800:24, 800...|\n" + "|{900:27, 900:27, 900...|\n" + "+-----------------------+\n"); + } +} +} // namespace vectorized +} // namespace doris \ No newline at end of file diff --git a/be/test/vec/exec/orc/orc_reader_init_column_test.cpp b/be/test/vec/exec/orc/orc_reader_init_column_test.cpp new file mode 100644 index 00000000000..44cc9cdfc59 --- /dev/null +++ b/be/test/vec/exec/orc/orc_reader_init_column_test.cpp @@ -0,0 +1,359 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include <gtest/gtest.h> + +#include <memory> + +#include "orc/ColumnPrinter.hh" +#include "orc_memory_stream_test.h" +#include "vec/core/types.h" +#include "vec/exec/format/orc/vorc_reader.h" + +namespace doris { +namespace vectorized { +class OrcReaderInitColumnTest : public ::testing::Test { +protected: + void SetUp() override {} + + void TearDown() override {} +}; +TEST_F(OrcReaderInitColumnTest, InitReadColumn) { + { + using namespace orc; + size_t rowCount = 10; + MemoryOutputStream memStream(100 * 1024 * 1024); + MemoryPool* pool = getDefaultPool(); + auto type = std::unique_ptr<Type>(Type::buildTypeFromString("struct<col1:int,col2:int>")); + WriterOptions options; + options.setMemoryPool(pool); + auto writer = createWriter(*type, &memStream, options); + auto batch = writer->createRowBatch(rowCount); + writer->add(*batch); + writer->close(); + + auto inStream = + std::make_unique<MemoryInputStream>(memStream.getData(), memStream.getLength()); + ReaderOptions readerOptions; + readerOptions.setMemoryPool(*pool); + auto orc_reader = createReader(std::move(inStream), readerOptions); + + TFileScanRangeParams params; + TFileRangeDesc range; + auto reader = OrcReader::create_unique(params, range, "", nullptr, true); + reader->_reader = std::move(orc_reader); + std::vector<std::string> tmp; + tmp.emplace_back("col1"); + + reader->_column_names = &tmp; + Status st = reader->_init_read_columns(); + std::cout << "st =" << st << "\n"; + std::list<std::string> ans; + ans.emplace_back("col1"); + ASSERT_EQ(ans, reader->_read_cols); + } + + { + using namespace orc; + size_t rowCount = 10; + MemoryOutputStream memStream(100 * 1024 * 1024); + MemoryPool* pool = getDefaultPool(); + auto type = std::unique_ptr<Type>(Type::buildTypeFromString("struct<col1:int,col2:int>")); + WriterOptions options; + options.setMemoryPool(pool); + auto writer = createWriter(*type, &memStream, options); + auto batch = writer->createRowBatch(rowCount); + writer->add(*batch); + writer->close(); + 
+ auto inStream = + std::make_unique<MemoryInputStream>(memStream.getData(), memStream.getLength()); + ReaderOptions readerOptions; + readerOptions.setMemoryPool(*pool); + auto orc_reader = createReader(std::move(inStream), readerOptions); + + TFileScanRangeParams params; + params.slot_name_to_schema_pos.insert({"xxxxx", 0}); + params.__isset.slot_name_to_schema_pos = true; + TFileRangeDesc range; + auto reader = OrcReader::create_unique(params, range, "", nullptr, true); + reader->_reader = std::move(orc_reader); + reader->_is_hive1_orc_or_use_idx = true; + std::vector<std::string> column_names; + column_names.emplace_back("xxxxx"); + + reader->_column_names = &column_names; + Status st = reader->_init_read_columns(); + + std::cout << "st =" << st << "\n"; + + std::list<std::string> ans; + ans.emplace_back("col1"); + ASSERT_EQ(ans, reader->_read_cols); + } + { + using namespace orc; + size_t rowCount = 10; + MemoryOutputStream memStream(100 * 1024 * 1024); + MemoryPool* pool = getDefaultPool(); + auto type = std::unique_ptr<Type>( + Type::buildTypeFromString("struct<_col0:int,_col1:int,_col2:bigint>")); + WriterOptions options; + options.setMemoryPool(pool); + auto writer = createWriter(*type, &memStream, options); + auto batch = writer->createRowBatch(rowCount); + writer->add(*batch); + writer->close(); + + auto inStream = + std::make_unique<MemoryInputStream>(memStream.getData(), memStream.getLength()); + ReaderOptions readerOptions; + readerOptions.setMemoryPool(*pool); + auto orc_reader = createReader(std::move(inStream), readerOptions); + + TFileScanRangeParams params; + params.slot_name_to_schema_pos.insert({"a", 0}); + params.slot_name_to_schema_pos.insert({"b", 1}); + params.slot_name_to_schema_pos.insert({"c", 2}); + + params.__isset.slot_name_to_schema_pos = true; + TFileRangeDesc range; + auto reader = OrcReader::create_unique(params, range, "", nullptr, true); + reader->_reader = std::move(orc_reader); + std::vector<std::string> column_names; + 
column_names.emplace_back("b"); + column_names.emplace_back("c"); + + reader->_column_names = &column_names; + Status st = reader->_init_read_columns(); + + std::list<std::string> ans; + ans.emplace_back("_col1"); + ans.emplace_back("_col2"); + ASSERT_EQ(ans, reader->_read_cols); + } + + { + using namespace orc; + auto acid_type = createStructType(); + acid_type->addStructField("operation", createPrimitiveType(orc::TypeKind::INT)); + acid_type->addStructField("originalTransaction", createPrimitiveType(orc::TypeKind::LONG)); + acid_type->addStructField("bucket", createPrimitiveType(orc::TypeKind::INT)); + acid_type->addStructField("rowId", createPrimitiveType(orc::TypeKind::LONG)); + acid_type->addStructField("currentTransaction", createPrimitiveType(orc::TypeKind::LONG)); + auto row_type = createStructType(); + row_type->addStructField("CoL1", createPrimitiveType(orc::TypeKind::LONG)); + row_type->addStructField("col2", createPrimitiveType(orc::TypeKind::LONG)); + row_type->addStructField("colUMN3", createPrimitiveType(orc::TypeKind::LONG)); + acid_type->addStructField("row", std::move(row_type)); + + size_t rowCount = 10; + MemoryOutputStream memStream(100 * 1024 * 1024); + MemoryPool* pool = getDefaultPool(); + WriterOptions options; + options.setMemoryPool(pool); + auto writer = createWriter(*acid_type, &memStream, options); + auto batch = writer->createRowBatch(rowCount); + writer->add(*batch); + writer->close(); + + auto inStream = + std::make_unique<MemoryInputStream>(memStream.getData(), memStream.getLength()); + ReaderOptions readerOptions; + readerOptions.setMemoryPool(*pool); + auto orc_reader = createReader(std::move(inStream), readerOptions); + + TFileScanRangeParams params; + TFileRangeDesc range; + auto reader = OrcReader::create_unique(params, range, "", nullptr, true); + reader->_reader = std::move(orc_reader); + std::vector<std::string> column_names; + column_names.emplace_back("col1"); + column_names.emplace_back("column3"); + 
reader->_column_names = &column_names; + reader->_is_acid = true; + Status st = reader->_init_read_columns(); + + std::list<std::string> ans; + ans.emplace_back("row.CoL1"); + ans.emplace_back("row.colUMN3"); + ASSERT_EQ(ans, reader->_read_cols); + } +} + +TEST_F(OrcReaderInitColumnTest, CheckAcidSchemaTest) { + using namespace orc; + TFileScanRangeParams params; + TFileRangeDesc range; + auto _reader = OrcReader::create_unique(params, range, "", nullptr, true); + // 1. Test standard ACID schema + { + // Create standard ACID structure + auto acid_type = createStructType(); + acid_type->addStructField("operation", createPrimitiveType(orc::TypeKind::INT)); + acid_type->addStructField("originalTransaction", createPrimitiveType(orc::TypeKind::LONG)); + acid_type->addStructField("bucket", createPrimitiveType(orc::TypeKind::INT)); + acid_type->addStructField("rowId", createPrimitiveType(orc::TypeKind::LONG)); + acid_type->addStructField("currentTransaction", createPrimitiveType(orc::TypeKind::LONG)); + acid_type->addStructField("row", createStructType()); + + ASSERT_TRUE(_reader->_check_acid_schema(*acid_type)); + } + + // 2. Test case-insensitive field names + { + auto acid_type = createStructType(); + acid_type->addStructField("OPERATION", createPrimitiveType(orc::TypeKind::INT)); + acid_type->addStructField("OriginalTransaction", createPrimitiveType(orc::TypeKind::LONG)); + acid_type->addStructField("Bucket", createPrimitiveType(orc::TypeKind::INT)); + acid_type->addStructField("ROWID", createPrimitiveType(orc::TypeKind::LONG)); + acid_type->addStructField("currentTRANSACTION", createPrimitiveType(orc::TypeKind::LONG)); + acid_type->addStructField("ROW", createStructType()); + + ASSERT_TRUE(_reader->_check_acid_schema(*acid_type)); + } + + // 3. 
Test non-ACID schema - field count mismatch + { + auto non_acid_type = createStructType(); + non_acid_type->addStructField("operation", createPrimitiveType(orc::TypeKind::INT)); + non_acid_type->addStructField("originalTransaction", + createPrimitiveType(orc::TypeKind::LONG)); + // Only added two fields + + ASSERT_FALSE(_reader->_check_acid_schema(*non_acid_type)); + } + + // 4. Test non-ACID schema - field name mismatch + { + auto wrong_name_type = createStructType(); + wrong_name_type->addStructField("operation", createPrimitiveType(orc::TypeKind::INT)); + wrong_name_type->addStructField("wrongName", createPrimitiveType(orc::TypeKind::LONG)); + wrong_name_type->addStructField("bucket", createPrimitiveType(orc::TypeKind::INT)); + wrong_name_type->addStructField("rowId", createPrimitiveType(orc::TypeKind::LONG)); + wrong_name_type->addStructField("currentTransaction", + createPrimitiveType(orc::TypeKind::LONG)); + wrong_name_type->addStructField("row", createStructType()); + + ASSERT_FALSE(_reader->_check_acid_schema(*wrong_name_type)); + } + + // 5. Test non-struct type + { + auto int_type = createPrimitiveType(orc::TypeKind::INT); + ASSERT_FALSE(_reader->_check_acid_schema(*int_type)); + + auto string_type = createPrimitiveType(orc::TypeKind::STRING); + ASSERT_FALSE(_reader->_check_acid_schema(*string_type)); + } +} + +TEST_F(OrcReaderInitColumnTest, RemoveAcidTest) { + using namespace orc; + TFileScanRangeParams params; + TFileRangeDesc range; + auto _reader = OrcReader::create_unique(params, range, "", nullptr, true); + // 1. 
Test removing ACID info from ACID schema + { + // Create ACID schema + auto acid_type = createStructType(); + acid_type->addStructField("operation", createPrimitiveType(orc::TypeKind::INT)); + acid_type->addStructField("originalTransaction", createPrimitiveType(orc::TypeKind::LONG)); + acid_type->addStructField("bucket", createPrimitiveType(orc::TypeKind::INT)); + acid_type->addStructField("rowId", createPrimitiveType(orc::TypeKind::LONG)); + acid_type->addStructField("currentTransaction", createPrimitiveType(orc::TypeKind::LONG)); + + // Create actual data structure + auto row_type = createStructType(); + row_type->addStructField("id", createPrimitiveType(orc::TypeKind::INT)); + row_type->addStructField("name", createPrimitiveType(orc::TypeKind::STRING)); + acid_type->addStructField("row", std::move(row_type)); + + // Verify that after removing ACID we get the type of the row field + const orc::Type& removed_type = _reader->_remove_acid(*acid_type); + ASSERT_EQ(removed_type.getKind(), orc::TypeKind::STRUCT); + ASSERT_EQ(removed_type.getSubtypeCount(), 2); // id and name fields + ASSERT_EQ(removed_type.getFieldName(0), "id"); + ASSERT_EQ(removed_type.getFieldName(1), "name"); + } + + // 2. Test that non-ACID schema remains unchanged + { + // Create normal schema + auto normal_type = createStructType(); + normal_type->addStructField("field1", createPrimitiveType(orc::TypeKind::INT)); + normal_type->addStructField("field2", createPrimitiveType(orc::TypeKind::STRING)); + + const orc::Type& result_type = _reader->_remove_acid(*normal_type); + ASSERT_EQ(&result_type, normal_type.get()); // Should return the same type + ASSERT_EQ(result_type.getSubtypeCount(), 2); + ASSERT_EQ(result_type.getFieldName(0), "field1"); + ASSERT_EQ(result_type.getFieldName(1), "field2"); + } + + // 3. 
Test primitive types (non-struct) remain unchanged + { + auto int_type = createPrimitiveType(orc::TypeKind::INT); + const orc::Type& result_type = _reader->_remove_acid(*int_type); + ASSERT_EQ(&result_type, int_type.get()); + ASSERT_EQ(result_type.getKind(), orc::TypeKind::INT); + } + + // 4. Test complex nested ACID schema + { + // Create nested ACID schema + auto acid_type = createStructType(); + acid_type->addStructField("operation", createPrimitiveType(orc::TypeKind::INT)); + acid_type->addStructField("originalTransaction", createPrimitiveType(orc::TypeKind::LONG)); + acid_type->addStructField("bucket", createPrimitiveType(orc::TypeKind::INT)); + acid_type->addStructField("rowId", createPrimitiveType(orc::TypeKind::LONG)); + acid_type->addStructField("currentTransaction", createPrimitiveType(orc::TypeKind::LONG)); + + // Create complex row structure + auto row_type = createStructType(); + + // Add basic fields + row_type->addStructField("id", createPrimitiveType(orc::TypeKind::INT)); + + // Add array field + auto array_type = createListType(createPrimitiveType(orc::TypeKind::STRING)); + row_type->addStructField("tags", std::move(array_type)); + + // Add Map field + auto map_type = createMapType(createPrimitiveType(orc::TypeKind::STRING), + createPrimitiveType(orc::TypeKind::INT)); + row_type->addStructField("properties", std::move(map_type)); + + acid_type->addStructField("row", std::move(row_type)); + + // Verify structure after removing ACID + const orc::Type& removed_type = _reader->_remove_acid(*acid_type); + ASSERT_EQ(removed_type.getKind(), orc::TypeKind::STRUCT); + ASSERT_EQ(removed_type.getSubtypeCount(), 3); // id, tags, properties + ASSERT_EQ(removed_type.getFieldName(0), "id"); + ASSERT_EQ(removed_type.getFieldName(1), "tags"); + ASSERT_EQ(removed_type.getFieldName(2), "properties"); + + // Verify field types + ASSERT_EQ(removed_type.getSubtype(0)->getKind(), orc::TypeKind::INT); + ASSERT_EQ(removed_type.getSubtype(1)->getKind(), 
orc::TypeKind::LIST); + ASSERT_EQ(removed_type.getSubtype(2)->getKind(), orc::TypeKind::MAP); + } +} + +} // namespace vectorized +} // namespace doris --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org For additional commands, e-mail: commits-h...@doris.apache.org