This is an automated email from the ASF dual-hosted git repository.
morningman pushed a commit to branch branch-2.1 in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/branch-2.1 by this push: new 38e529cd296 [cherry-pick](branch-2.1) support decimal256 for parquet reader (#42241) 38e529cd296 is described below commit 38e529cd2969dc9088bc2a5798ada93f008b1ae8 Author: Socrates <suxiaogang...@icloud.com> AuthorDate: Tue Oct 22 19:42:09 2024 +0800 [cherry-pick](branch-2.1) support decimal256 for parquet reader (#42241) ## Proposed changes pick pr: https://github.com/apache/doris/pull/41526 --- be/src/gutil/endian.h | 13 +++++++++++-- be/src/util/bit_util.h | 9 ++++++++- be/src/vec/core/wide_integer.h | 1 + .../exec/format/parquet/parquet_column_convert.cpp | 5 ++++- .../vec/exec/format/parquet/parquet_column_convert.h | 19 ++++++++++++++++++- be/test/util/bit_util_test.cpp | 19 ++++++++++++++++++- .../hdfs_tvf/test_parquet_decimal256.parquet | Bin 0 -> 1320 bytes .../external_table_p0/tvf/test_hdfs_tvf.groovy | 8 ++++++++ 8 files changed, 68 insertions(+), 6 deletions(-) diff --git a/be/src/gutil/endian.h b/be/src/gutil/endian.h index f1a9cf2a1a2..6af893ea7a3 100644 --- a/be/src/gutil/endian.h +++ b/be/src/gutil/endian.h @@ -61,8 +61,8 @@ inline unsigned __int128 gbswap_128(unsigned __int128 host_int) { } inline wide::UInt256 gbswap_256(wide::UInt256 host_int) { - wide::UInt256 result{gbswap_64(host_int.items[3]), gbswap_64(host_int.items[2]), - gbswap_64(host_int.items[1]), gbswap_64(host_int.items[0])}; + wide::UInt256 result {gbswap_64(host_int.items[3]), gbswap_64(host_int.items[2]), + gbswap_64(host_int.items[1]), gbswap_64(host_int.items[0])}; return result; } @@ -137,6 +137,9 @@ public: static unsigned __int128 FromHost128(unsigned __int128 x) { return x; } static unsigned __int128 ToHost128(unsigned __int128 x) { return x; } + static wide::UInt256 FromHost256(wide::UInt256 x) { return x; } + static wide::UInt256 ToHost256(wide::UInt256 x) { return x; } + static bool IsLittleEndian() { return true; } #elif defined IS_BIG_ENDIAN @@ -150,6 +153,12 @@ public: static uint64 
FromHost64(uint64 x) { return gbswap_64(x); } static uint64 ToHost64(uint64 x) { return gbswap_64(x); } + static unsigned __int128 FromHost128(unsigned __int128 x) { return gbswap_128(x); } + static unsigned __int128 ToHost128(unsigned __int128 x) { return gbswap_128(x); } + + static wide::UInt256 FromHost256(wide::UInt256 x) { return gbswap_256(x); } + static wide::UInt256 ToHost256(wide::UInt256 x) { return gbswap_256(x); } + static bool IsLittleEndian() { return false; } #endif /* ENDIAN */ diff --git a/be/src/util/bit_util.h b/be/src/util/bit_util.h index 6934f45ef3e..6b7385c0613 100644 --- a/be/src/util/bit_util.h +++ b/be/src/util/bit_util.h @@ -20,6 +20,9 @@ #pragma once +#include <type_traits> + +#include "vec/core/wide_integer.h" #ifndef __APPLE__ #include <endian.h> #endif @@ -209,7 +212,11 @@ public: template <typename T> static T big_endian_to_host(T value) { - if constexpr (std::is_same_v<T, __int128>) { + if constexpr (std::is_same_v<T, wide::Int256>) { + return BigEndian::ToHost256(value); + } else if constexpr (std::is_same_v<T, wide::UInt256>) { + return BigEndian::ToHost256(value); + } else if constexpr (std::is_same_v<T, __int128>) { return BigEndian::ToHost128(value); } else if constexpr (std::is_same_v<T, unsigned __int128>) { return BigEndian::ToHost128(value); diff --git a/be/src/vec/core/wide_integer.h b/be/src/vec/core/wide_integer.h index e7902e414a8..261a41d16b9 100644 --- a/be/src/vec/core/wide_integer.h +++ b/be/src/vec/core/wide_integer.h @@ -40,6 +40,7 @@ // and modified by Doris #pragma once +#include <cstddef> #include <cstdint> #include <initializer_list> #include <limits> diff --git a/be/src/vec/exec/format/parquet/parquet_column_convert.cpp b/be/src/vec/exec/format/parquet/parquet_column_convert.cpp index 2fb0afea82a..0a5ef2913dd 100644 --- a/be/src/vec/exec/format/parquet/parquet_column_convert.cpp +++ b/be/src/vec/exec/format/parquet/parquet_column_convert.cpp @@ -19,6 +19,7 @@ #include <cctz/time_zone.h> +#include 
"runtime/define_primitive_type.h" #include "vec/columns/column_nullable.h" namespace doris::vectorized::parquet { const cctz::time_zone ConvertParams::utc0 = cctz::utc_time_zone(); @@ -27,7 +28,8 @@ const cctz::time_zone ConvertParams::utc0 = cctz::utc_time_zone(); M(TYPE_DECIMALV2) \ M(TYPE_DECIMAL32) \ M(TYPE_DECIMAL64) \ - M(TYPE_DECIMAL128I) + M(TYPE_DECIMAL128I) \ + M(TYPE_DECIMAL256) bool PhysicalToLogicalConverter::is_parquet_native_type(PrimitiveType type) { switch (type) { @@ -50,6 +52,7 @@ bool PhysicalToLogicalConverter::is_decimal_type(doris::PrimitiveType type) { case TYPE_DECIMAL32: case TYPE_DECIMAL64: case TYPE_DECIMAL128I: + case TYPE_DECIMAL256: case TYPE_DECIMALV2: return true; default: diff --git a/be/src/vec/exec/format/parquet/parquet_column_convert.h b/be/src/vec/exec/format/parquet/parquet_column_convert.h index 91b81121aa4..cf6f8aa13fa 100644 --- a/be/src/vec/exec/format/parquet/parquet_column_convert.h +++ b/be/src/vec/exec/format/parquet/parquet_column_convert.h @@ -20,6 +20,7 @@ #include <gen_cpp/parquet_types.h> #include "vec/core/types.h" +#include "vec/core/wide_integer.h" #include "vec/data_types/data_type_factory.hpp" #include "vec/exec/format/column_type_convert.h" #include "vec/exec/format/format_common.h" @@ -401,7 +402,23 @@ public: M(13, int128_t) \ M(14, int128_t) \ M(15, int128_t) \ - M(16, int128_t) + M(16, int128_t) \ + M(17, wide::Int256) \ + M(18, wide::Int256) \ + M(19, wide::Int256) \ + M(20, wide::Int256) \ + M(21, wide::Int256) \ + M(22, wide::Int256) \ + M(23, wide::Int256) \ + M(24, wide::Int256) \ + M(25, wide::Int256) \ + M(26, wide::Int256) \ + M(27, wide::Int256) \ + M(28, wide::Int256) \ + M(29, wide::Int256) \ + M(30, wide::Int256) \ + M(31, wide::Int256) \ + M(32, wide::Int256) switch (_type_length) { APPLY_FOR_DECIMALS() diff --git a/be/test/util/bit_util_test.cpp b/be/test/util/bit_util_test.cpp index 514daafa604..fd3bee01432 100644 --- a/be/test/util/bit_util_test.cpp +++ b/be/test/util/bit_util_test.cpp 
@@ -21,7 +21,6 @@ #include <gtest/gtest-test-part.h> #include <boost/utility/binary.hpp> -#include <memory> #include "gtest/gtest_pred_impl.h" @@ -48,4 +47,22 @@ TEST(BitUtil, Popcount) { EXPECT_EQ(BitUtil::popcount_no_hw(0), 0); } +TEST(BitUtil, BigEndianToHost) { + uint16_t v16 = 0x1234; + uint32_t v32 = 0x12345678; + uint64_t v64 = 0x123456789abcdef0; + unsigned __int128 v128 = ((__int128)0x123456789abcdef0LL << 64) | 0x123456789abcdef0LL; + wide::UInt256 v256 = + wide::UInt256(0x123456789abcdef0) << 192 | wide::UInt256(0x123456789abcdef0) << 128 | + wide::UInt256(0x123456789abcdef0) << 64 | wide::UInt256(0x123456789abcdef0); + EXPECT_EQ(BitUtil::big_endian_to_host(v16), 0x3412); + EXPECT_EQ(BitUtil::big_endian_to_host(v32), 0x78563412); + EXPECT_EQ(BitUtil::big_endian_to_host(v64), 0xf0debc9a78563412); + EXPECT_EQ(BitUtil::big_endian_to_host(v128), + ((__int128)0xf0debc9a78563412LL << 64) | 0xf0debc9a78563412LL); + EXPECT_EQ(BitUtil::big_endian_to_host(v256), + wide::UInt256(0xf0debc9a78563412) << 192 | wide::UInt256(0xf0debc9a78563412) << 128 | + wide::UInt256(0xf0debc9a78563412) << 64 | wide::UInt256(0xf0debc9a78563412)); +} + } // namespace doris diff --git a/docker/thirdparties/docker-compose/hive/scripts/preinstalled_data/hdfs_tvf/test_parquet_decimal256.parquet b/docker/thirdparties/docker-compose/hive/scripts/preinstalled_data/hdfs_tvf/test_parquet_decimal256.parquet new file mode 100644 index 00000000000..323ded32160 Binary files /dev/null and b/docker/thirdparties/docker-compose/hive/scripts/preinstalled_data/hdfs_tvf/test_parquet_decimal256.parquet differ diff --git a/regression-test/suites/external_table_p0/tvf/test_hdfs_tvf.groovy b/regression-test/suites/external_table_p0/tvf/test_hdfs_tvf.groovy index be27e213f6b..764c4842229 100644 --- a/regression-test/suites/external_table_p0/tvf/test_hdfs_tvf.groovy +++ b/regression-test/suites/external_table_p0/tvf/test_hdfs_tvf.groovy @@ -116,6 +116,14 @@ 
suite("test_hdfs_tvf","external,hive,tvf,external_docker") { "hadoop.username" = "${hdfsUserName}", "format" = "${format}") order by s_suppkey limit 20; """ + // test parquet decimal256 + uri = "${defaultFS}" + "/user/doris/preinstalled_data/hdfs_tvf/test_parquet_decimal256.parquet" + format = "parquet" + qt_parquet_decimal256 """ select * from HDFS( + "uri" = "${uri}", + "hadoop.username" = "${hdfsUserName}", + "format" = "${format}") order by id; """ + // test orc uri = "${defaultFS}" + "/user/doris/preinstalled_data/hdfs_tvf/test_orc.snappy.orc" format = "orc" --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org For additional commands, e-mail: commits-h...@doris.apache.org