(doris) branch branch-2.1 updated: [cherry-pick](branch-2.1) support decimal256 for parquet reader (#42241)

morningman Tue, 22 Oct 2024 04:42:21 -0700

This is an automated email from the ASF dual-hosted git repository.

morningman pushed a commit to branch branch-2.1
in repository https://gitbox.apache.org/repos/asf/doris.git



The following commit(s) were added to refs/heads/branch-2.1 by this push:
     new 38e529cd296 [cherry-pick](branch-2.1) support decimal256 for parquet reader (#42241)
38e529cd296 is described below

commit 38e529cd2969dc9088bc2a5798ada93f008b1ae8
Author: Socrates <suxiaogang...@icloud.com>
AuthorDate: Tue Oct 22 19:42:09 2024 +0800

    [cherry-pick](branch-2.1) support decimal256 for parquet reader (#42241)
    
    ## Proposed changes
    pick pr: https://github.com/apache/doris/pull/41526
---
 be/src/gutil/endian.h                                |  13 +++++++++++--
 be/src/util/bit_util.h                               |   9 ++++++++-
 be/src/vec/core/wide_integer.h                       |   1 +
 .../exec/format/parquet/parquet_column_convert.cpp   |   5 ++++-
 .../vec/exec/format/parquet/parquet_column_convert.h |  19 ++++++++++++++++++-
 be/test/util/bit_util_test.cpp                       |  19 ++++++++++++++++++-
 .../hdfs_tvf/test_parquet_decimal256.parquet         | Bin 0 -> 1320 bytes
 .../external_table_p0/tvf/test_hdfs_tvf.groovy       |   8 ++++++++
 8 files changed, 68 insertions(+), 6 deletions(-)

diff --git a/be/src/gutil/endian.h b/be/src/gutil/endian.h
index f1a9cf2a1a2..6af893ea7a3 100644
--- a/be/src/gutil/endian.h
+++ b/be/src/gutil/endian.h
@@ -61,8 +61,8 @@ inline unsigned __int128 gbswap_128(unsigned __int128 
host_int) {
 }
 
 inline wide::UInt256 gbswap_256(wide::UInt256 host_int) {
-    wide::UInt256 result{gbswap_64(host_int.items[3]), 
gbswap_64(host_int.items[2]),
-                         gbswap_64(host_int.items[1]), 
gbswap_64(host_int.items[0])};
+    wide::UInt256 result {gbswap_64(host_int.items[3]), 
gbswap_64(host_int.items[2]),
+                          gbswap_64(host_int.items[1]), 
gbswap_64(host_int.items[0])};
     return result;
 }
 
@@ -137,6 +137,9 @@ public:
     static unsigned __int128 FromHost128(unsigned __int128 x) { return x; }
     static unsigned __int128 ToHost128(unsigned __int128 x) { return x; }
 
+    static wide::UInt256 FromHost256(wide::UInt256 x) { return x; }
+    static wide::UInt256 ToHost256(wide::UInt256 x) { return x; }
+
     static bool IsLittleEndian() { return true; }
 
 #elif defined IS_BIG_ENDIAN
@@ -150,6 +153,12 @@ public:
     static uint64 FromHost64(uint64 x) { return gbswap_64(x); }
     static uint64 ToHost64(uint64 x) { return gbswap_64(x); }
 
+    static unsigned __int128 FromHost128(unsigned __int128 x) { return 
gbswap_128(x); }
+    static unsigned __int128 ToHost128(unsigned __int128 x) { return 
gbswap_128(x); }
+
+    static wide::UInt256 FromHost256(wide::UInt256 x) { return gbswap_256(x); }
+    static wide::UInt256 ToHost256(wide::UInt256 x) { return gbswap_256(x); }
+
     static bool IsLittleEndian() { return false; }
 
 #endif /* ENDIAN */
diff --git a/be/src/util/bit_util.h b/be/src/util/bit_util.h
index 6934f45ef3e..6b7385c0613 100644
--- a/be/src/util/bit_util.h
+++ b/be/src/util/bit_util.h
@@ -20,6 +20,9 @@
 
 #pragma once
 
+#include <type_traits>
+
+#include "vec/core/wide_integer.h"
 #ifndef __APPLE__
 #include <endian.h>
 #endif
@@ -209,7 +212,11 @@ public:
 
     template <typename T>
     static T big_endian_to_host(T value) {
-        if constexpr (std::is_same_v<T, __int128>) {
+        if constexpr (std::is_same_v<T, wide::Int256>) {
+            return BigEndian::ToHost256(value);
+        } else if constexpr (std::is_same_v<T, wide::UInt256>) {
+            return BigEndian::ToHost256(value);
+        } else if constexpr (std::is_same_v<T, __int128>) {
             return BigEndian::ToHost128(value);
         } else if constexpr (std::is_same_v<T, unsigned __int128>) {
             return BigEndian::ToHost128(value);
diff --git a/be/src/vec/core/wide_integer.h b/be/src/vec/core/wide_integer.h
index e7902e414a8..261a41d16b9 100644
--- a/be/src/vec/core/wide_integer.h
+++ b/be/src/vec/core/wide_integer.h
@@ -40,6 +40,7 @@
 // and modified by Doris
 #pragma once
 
+#include <cstddef>
 #include <cstdint>
 #include <initializer_list>
 #include <limits>
diff --git a/be/src/vec/exec/format/parquet/parquet_column_convert.cpp 
b/be/src/vec/exec/format/parquet/parquet_column_convert.cpp
index 2fb0afea82a..0a5ef2913dd 100644
--- a/be/src/vec/exec/format/parquet/parquet_column_convert.cpp
+++ b/be/src/vec/exec/format/parquet/parquet_column_convert.cpp
@@ -19,6 +19,7 @@
 
 #include <cctz/time_zone.h>
 
+#include "runtime/define_primitive_type.h"
 #include "vec/columns/column_nullable.h"
 namespace doris::vectorized::parquet {
 const cctz::time_zone ConvertParams::utc0 = cctz::utc_time_zone();
@@ -27,7 +28,8 @@ const cctz::time_zone ConvertParams::utc0 = 
cctz::utc_time_zone();
     M(TYPE_DECIMALV2)                \
     M(TYPE_DECIMAL32)                \
     M(TYPE_DECIMAL64)                \
-    M(TYPE_DECIMAL128I)
+    M(TYPE_DECIMAL128I)              \
+    M(TYPE_DECIMAL256)
 
 bool PhysicalToLogicalConverter::is_parquet_native_type(PrimitiveType type) {
     switch (type) {
@@ -50,6 +52,7 @@ bool 
PhysicalToLogicalConverter::is_decimal_type(doris::PrimitiveType type) {
     case TYPE_DECIMAL32:
     case TYPE_DECIMAL64:
     case TYPE_DECIMAL128I:
+    case TYPE_DECIMAL256:
     case TYPE_DECIMALV2:
         return true;
     default:
diff --git a/be/src/vec/exec/format/parquet/parquet_column_convert.h 
b/be/src/vec/exec/format/parquet/parquet_column_convert.h
index 91b81121aa4..cf6f8aa13fa 100644
--- a/be/src/vec/exec/format/parquet/parquet_column_convert.h
+++ b/be/src/vec/exec/format/parquet/parquet_column_convert.h
@@ -20,6 +20,7 @@
 #include <gen_cpp/parquet_types.h>
 
 #include "vec/core/types.h"
+#include "vec/core/wide_integer.h"
 #include "vec/data_types/data_type_factory.hpp"
 #include "vec/exec/format/column_type_convert.h"
 #include "vec/exec/format/format_common.h"
@@ -401,7 +402,23 @@ public:
     M(13, int128_t)          \
     M(14, int128_t)          \
     M(15, int128_t)          \
-    M(16, int128_t)
+    M(16, int128_t)          \
+    M(17, wide::Int256)      \
+    M(18, wide::Int256)      \
+    M(19, wide::Int256)      \
+    M(20, wide::Int256)      \
+    M(21, wide::Int256)      \
+    M(22, wide::Int256)      \
+    M(23, wide::Int256)      \
+    M(24, wide::Int256)      \
+    M(25, wide::Int256)      \
+    M(26, wide::Int256)      \
+    M(27, wide::Int256)      \
+    M(28, wide::Int256)      \
+    M(29, wide::Int256)      \
+    M(30, wide::Int256)      \
+    M(31, wide::Int256)      \
+    M(32, wide::Int256)
 
         switch (_type_length) {
             APPLY_FOR_DECIMALS()
diff --git a/be/test/util/bit_util_test.cpp b/be/test/util/bit_util_test.cpp
index 514daafa604..fd3bee01432 100644
--- a/be/test/util/bit_util_test.cpp
+++ b/be/test/util/bit_util_test.cpp
@@ -21,7 +21,6 @@
 #include <gtest/gtest-test-part.h>
 
 #include <boost/utility/binary.hpp>
-#include <memory>
 
 #include "gtest/gtest_pred_impl.h"
 
@@ -48,4 +47,22 @@ TEST(BitUtil, Popcount) {
     EXPECT_EQ(BitUtil::popcount_no_hw(0), 0);
 }
 
+TEST(BitUtil, BigEndianToHost) {
+    uint16_t v16 = 0x1234;
+    uint32_t v32 = 0x12345678;
+    uint64_t v64 = 0x123456789abcdef0;
+    unsigned __int128 v128 = ((__int128)0x123456789abcdef0LL << 64) | 
0x123456789abcdef0LL;
+    wide::UInt256 v256 =
+            wide::UInt256(0x123456789abcdef0) << 192 | 
wide::UInt256(0x123456789abcdef0) << 128 |
+            wide::UInt256(0x123456789abcdef0) << 64 | 
wide::UInt256(0x123456789abcdef0);
+    EXPECT_EQ(BitUtil::big_endian_to_host(v16), 0x3412);
+    EXPECT_EQ(BitUtil::big_endian_to_host(v32), 0x78563412);
+    EXPECT_EQ(BitUtil::big_endian_to_host(v64), 0xf0debc9a78563412);
+    EXPECT_EQ(BitUtil::big_endian_to_host(v128),
+              ((__int128)0xf0debc9a78563412LL << 64) | 0xf0debc9a78563412LL);
+    EXPECT_EQ(BitUtil::big_endian_to_host(v256),
+              wide::UInt256(0xf0debc9a78563412) << 192 | 
wide::UInt256(0xf0debc9a78563412) << 128 |
+                      wide::UInt256(0xf0debc9a78563412) << 64 | 
wide::UInt256(0xf0debc9a78563412));
+}
+
 } // namespace doris
diff --git 
a/docker/thirdparties/docker-compose/hive/scripts/preinstalled_data/hdfs_tvf/test_parquet_decimal256.parquet
 
b/docker/thirdparties/docker-compose/hive/scripts/preinstalled_data/hdfs_tvf/test_parquet_decimal256.parquet
new file mode 100644
index 00000000000..323ded32160
Binary files /dev/null and 
b/docker/thirdparties/docker-compose/hive/scripts/preinstalled_data/hdfs_tvf/test_parquet_decimal256.parquet
 differ
diff --git a/regression-test/suites/external_table_p0/tvf/test_hdfs_tvf.groovy 
b/regression-test/suites/external_table_p0/tvf/test_hdfs_tvf.groovy
index be27e213f6b..764c4842229 100644
--- a/regression-test/suites/external_table_p0/tvf/test_hdfs_tvf.groovy
+++ b/regression-test/suites/external_table_p0/tvf/test_hdfs_tvf.groovy
@@ -116,6 +116,14 @@ suite("test_hdfs_tvf","external,hive,tvf,external_docker") 
{
                             "hadoop.username" = "${hdfsUserName}",
                             "format" = "${format}") order by s_suppkey limit 
20; """
 
+            // test parquet decimal256
+            uri = "${defaultFS}" + 
"/user/doris/preinstalled_data/hdfs_tvf/test_parquet_decimal256.parquet"
+            format = "parquet"
+            qt_parquet_decimal256 """ select * from HDFS(
+                            "uri" = "${uri}",
+                            "hadoop.username" = "${hdfsUserName}",
+                            "format" = "${format}") order by id; """
+
             // test orc
             uri = "${defaultFS}" + 
"/user/doris/preinstalled_data/hdfs_tvf/test_orc.snappy.orc"
             format = "orc"


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org
For additional commands, e-mail: commits-h...@doris.apache.org

(doris) branch branch-2.1 updated: [cherry-pick](branch-2.1) support decimal256 for parquet reader (#42241)

Reply via email to