This is an automated email from the ASF dual-hosted git repository. morningman pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/master by this push: new ff6d2ea7559 [enhance](parquet) support reading brotli compressed parquet file (#41875) ff6d2ea7559 is described below commit ff6d2ea7559e3ee88e3cc60726e8749f132527c7 Author: Socrates <suxiaogang...@icloud.com> AuthorDate: Mon Oct 21 09:42:54 2024 +0800 [enhance](parquet) support reading brotli compressed parquet file (#41875) ## Proposed changes Impl BrotliBlockCompression to uncompressed brotli parquet data. fix parquet case: group0/large_string_map.brotli.parquet --- be/src/util/block_compression.cpp | 37 +++++++++++++++++---- .../hdfs_tvf/test_parquet.brotli.parquet | Bin 0 -> 291443 bytes .../tvf/test_hdfs_parquet_group0.out | Bin 23993 -> 24011 bytes .../data/external_table_p0/tvf/test_hdfs_tvf.out | 22 ++++++++++++ .../tvf/test_hdfs_parquet_group0.groovy | 7 ++-- .../external_table_p0/tvf/test_hdfs_tvf.groovy | 8 +++++ 6 files changed, 62 insertions(+), 12 deletions(-) diff --git a/be/src/util/block_compression.cpp b/be/src/util/block_compression.cpp index ae672068119..d13c0c091b9 100644 --- a/be/src/util/block_compression.cpp +++ b/be/src/util/block_compression.cpp @@ -28,24 +28,23 @@ defined(__i386) || defined(_M_IX86) #include <libdeflate.h> #endif +#include <brotli/decode.h> #include <glog/log_severity.h> #include <glog/logging.h> -#include <limits.h> #include <lz4/lz4.h> #include <lz4/lz4frame.h> #include <lz4/lz4hc.h> #include <snappy/snappy-sinksource.h> #include <snappy/snappy.h> -#include <stdint.h> #include <zconf.h> #include <zlib.h> #include <zstd.h> #include <zstd_errors.h> #include <algorithm> +#include <cstdint> #include <limits> #include <mutex> -#include <new> #include <ostream> #include "common/config.h" @@ -53,9 +52,7 @@ #include "exec/decompressor.h" #include "gutil/endian.h" #include "gutil/strings/substitute.h" -#include "orc/OrcFile.hh" #include "runtime/thread_context.h" -#include "util/bit_util.h" #include "util/defer_op.h" #include "util/faststring.h" @@ -74,8 
+71,6 @@ uint64_t lzoDecompress(const char* inputAddress, const char* inputLimit, char* o namespace doris { -using strings::Substitute; - // exception safe Status BlockCompressionCodec::compress(const std::vector<Slice>& inputs, size_t uncompressed_size, faststring* output) { @@ -1492,6 +1487,31 @@ public: } }; +class BrotliBlockCompression final : public BlockCompressionCodec { +public: + static BrotliBlockCompression* instance() { + static BrotliBlockCompression s_instance; + return &s_instance; + } + + Status compress(const Slice& input, faststring* output) override { + return Status::InvalidArgument("not impl brotli compress."); + } + + size_t max_compressed_len(size_t len) override { return 0; }; + + Status decompress(const Slice& input, Slice* output) override { + // The size of output buffer is always equal to the umcompressed length. + BrotliDecoderResult result = BrotliDecoderDecompress( + input.get_size(), reinterpret_cast<const uint8_t*>(input.get_data()), &output->size, + reinterpret_cast<uint8_t*>(output->data)); + if (result != BROTLI_DECODER_RESULT_SUCCESS) { + return Status::InternalError("Brotli decompression failed, result={}", result); + } + return Status::OK(); + } +}; + Status get_block_compression_codec(segment_v2::CompressionTypePB type, BlockCompressionCodec** codec) { switch (type) { @@ -1582,6 +1602,9 @@ Status get_block_compression_codec(tparquet::CompressionCodec::type parquet_code case tparquet::CompressionCodec::LZO: *codec = LzoBlockCompression::instance(); break; + case tparquet::CompressionCodec::BROTLI: + *codec = BrotliBlockCompression::instance(); + break; default: return Status::InternalError("unknown compression type({})", parquet_codec); } diff --git a/docker/thirdparties/docker-compose/hive/scripts/preinstalled_data/hdfs_tvf/test_parquet.brotli.parquet b/docker/thirdparties/docker-compose/hive/scripts/preinstalled_data/hdfs_tvf/test_parquet.brotli.parquet new file mode 100644 index 00000000000..be60868a398 Binary files 
/dev/null and b/docker/thirdparties/docker-compose/hive/scripts/preinstalled_data/hdfs_tvf/test_parquet.brotli.parquet differ diff --git a/regression-test/data/external_table_p0/tvf/test_hdfs_parquet_group0.out b/regression-test/data/external_table_p0/tvf/test_hdfs_parquet_group0.out index 6b58c1478b1..7ed43af1f35 100644 Binary files a/regression-test/data/external_table_p0/tvf/test_hdfs_parquet_group0.out and b/regression-test/data/external_table_p0/tvf/test_hdfs_parquet_group0.out differ diff --git a/regression-test/data/external_table_p0/tvf/test_hdfs_tvf.out b/regression-test/data/external_table_p0/tvf/test_hdfs_tvf.out index e850e38a237..a8f5dcf5396 100644 --- a/regression-test/data/external_table_p0/tvf/test_hdfs_tvf.out +++ b/regression-test/data/external_table_p0/tvf/test_hdfs_tvf.out @@ -221,6 +221,28 @@ 19 Supplier#000000019 edZT3es,nBFD8lBXTGeTl 24 34-278-310-2731 6150.38 refully final foxes across the dogged theodolites sleep slyly abou 20 Supplier#000000020 iybAE,RmTymrZVYaFZva2SH,j 3 13-715-945-6730 530.82 n, ironic ideas would nag blithely about the slyly regular accounts. silent, expr +-- !parquet_brotli -- +1 Supplier#000000001 N kD4on9OM Ipw3,gf0JBoQDd7tgrzrddZ 17 27-918-335-1736 5755.94 each slyly above the careful +2 Supplier#000000002 89eJ5ksX3ImxJQBvxObC, 5 15-679-861-2259 4032.68 slyly bold instructions. idle dependen +3 Supplier#000000003 q1,G3Pj6OjIuUYfUoH18BFTKP5aU9bEV3 1 11-383-516-1199 4192.40 blithely silent requests after the express dependencies are sl +4 Supplier#000000004 Bk7ah4CK8SYQTepEmvMkkgMwg 15 25-843-787-7479 4641.08 riously even requests above the exp +5 Supplier#000000005 Gcdm2rJRzl5qlTVzc 11 21-151-690-3663 -283.84 . slyly regular pinto bea +6 Supplier#000000006 tQxuVm7s7CnK 14 24-696-997-4969 1365.79 final accounts. regular dolphins use against the furiously ironic decoys. +7 Supplier#000000007 s,4TicNGB4uO6PaSqNBUq 23 33-990-965-2201 6820.35 s unwind silently furiously regular courts. final requests are deposits. 
requests wake quietly blit +8 Supplier#000000008 9Sq4bBH2FQEmaFOocY45sRTxo6yuoG 17 27-498-742-3860 7627.85 al pinto beans. asymptotes haggl +9 Supplier#000000009 1KhUgZegwM3ua7dsYmekYBsK 10 20-403-398-8662 5302.37 s. unusual, even requests along the furiously regular pac +10 Supplier#000000010 Saygah3gYWMp72i PY 24 34-852-489-8585 3891.91 ing waters. regular requests ar +11 Supplier#000000011 JfwTs,LZrV, M,9C 18 28-613-996-1505 3393.08 y ironic packages. slyly ironic accounts affix furiously; ironically unusual excuses across the flu +12 Supplier#000000012 aLIW q0HYd 8 18-179-925-7181 1432.69 al packages nag alongside of the bold instructions. express, daring accounts +13 Supplier#000000013 HK71HQyWoqRWOX8GI FpgAifW,2PoH 3 13-727-620-7813 9107.22 requests engage regularly instructions. furiously special requests ar +14 Supplier#000000014 EXsnO5pTNj4iZRm 15 25-656-247-5058 9189.82 l accounts boost. fluffily bold warhorses wake +15 Supplier#000000015 olXVbNBfVzRqgokr1T,Ie 8 18-453-357-6394 308.56 across the furiously regular platelets wake even deposits. quickly express she +16 Supplier#000000016 YjP5C55zHDXL7LalK27zfQnwejdpin4AMpvh 22 32-822-502-4215 2972.26 ously express ideas haggle quickly dugouts? fu +17 Supplier#000000017 c2d,ESHRSkK3WYnxpgw6aOqN0q 19 29-601-884-9219 1687.81 eep against the furiously bold ideas. fluffily bold packa +18 Supplier#000000018 PGGVE5PWAMwKDZw 16 26-729-551-1115 7040.82 accounts snooze slyly furiously bold +19 Supplier#000000019 edZT3es,nBFD8lBXTGeTl 24 34-278-310-2731 6150.38 refully final foxes across the dogged theodolites sleep slyly abou +20 Supplier#000000020 iybAE,RmTymrZVYaFZva2SH,j 3 13-715-945-6730 530.82 n, ironic ideas would nag blithely about the slyly regular accounts. 
silent, expr + -- !parquet_decimal256 -- 1 99999999999999999999999999999999999999.99999999999999999999999999999999999999 2 -99999999999999999999999999999999999999.99999999999999999999999999999999999999 diff --git a/regression-test/suites/external_table_p0/tvf/test_hdfs_parquet_group0.groovy b/regression-test/suites/external_table_p0/tvf/test_hdfs_parquet_group0.groovy index 65d6732e272..47fc8574a34 100644 --- a/regression-test/suites/external_table_p0/tvf/test_hdfs_parquet_group0.groovy +++ b/regression-test/suites/external_table_p0/tvf/test_hdfs_parquet_group0.groovy @@ -104,13 +104,10 @@ suite("test_hdfs_parquet_group0","external,hive,tvf,external_docker") { uri = "${defaultFS}" + "/user/doris/tvf_data/test_hdfs_parquet/group0/large_string_map.brotli.parquet" - test { - sql """ select * from HDFS( + order_qt_test_11 """ select count(arr) from HDFS( "uri" = "${uri}", "hadoop.username" = "${hdfsUserName}", - "format" = "parquet") limit 10; """ - exception "unknown compression type(4)" - } + "format" = "parquet"); """ uri = "${defaultFS}" + "/user/doris/tvf_data/test_hdfs_parquet/group0/non_hadoop_lz4_compressed.parquet" diff --git a/regression-test/suites/external_table_p0/tvf/test_hdfs_tvf.groovy b/regression-test/suites/external_table_p0/tvf/test_hdfs_tvf.groovy index 02bda4ec0dd..74cb1e320aa 100644 --- a/regression-test/suites/external_table_p0/tvf/test_hdfs_tvf.groovy +++ b/regression-test/suites/external_table_p0/tvf/test_hdfs_tvf.groovy @@ -107,6 +107,14 @@ suite("test_hdfs_tvf","external,hive,tvf,external_docker") { "uri" = "${uri}", "hadoop.username" = "${hdfsUserName}", "format" = "${format}") order by s_suppkey limit 20; """ + + // test parquet brotli + uri = "${defaultFS}" + "/user/doris/preinstalled_data/hdfs_tvf/test_parquet.brotli.parquet" + format = "parquet" + qt_parquet_brotli """ select * from HDFS( + "uri" = "${uri}", + "hadoop.username" = "${hdfsUserName}", + "format" = "${format}") order by s_suppkey limit 20; """ // test parquet decimal256 uri 
= "${defaultFS}" + "/user/doris/preinstalled_data/hdfs_tvf/test_parquet_decimal256.parquet" --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscribe@doris.apache.org For additional commands, e-mail: commits-help@doris.apache.org