This is an automated email from the ASF dual-hosted git repository.

morningman pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/doris.git


The following commit(s) were added to refs/heads/master by this push:
     new ff6d2ea7559 [enhance](parquet) support reading brotli compressed 
parquet file (#41875)
ff6d2ea7559 is described below

commit ff6d2ea7559e3ee88e3cc60726e8749f132527c7
Author: Socrates <suxiaogang...@icloud.com>
AuthorDate: Mon Oct 21 09:42:54 2024 +0800

    [enhance](parquet) support reading brotli compressed parquet file (#41875)
    
    ## Proposed changes
    
    Implement BrotliBlockCompression to decompress brotli-compressed parquet data.
    Fix the parquet case: group0/large_string_map.brotli.parquet
---
 be/src/util/block_compression.cpp                  |  37 +++++++++++++++++----
 .../hdfs_tvf/test_parquet.brotli.parquet           | Bin 0 -> 291443 bytes
 .../tvf/test_hdfs_parquet_group0.out               | Bin 23993 -> 24011 bytes
 .../data/external_table_p0/tvf/test_hdfs_tvf.out   |  22 ++++++++++++
 .../tvf/test_hdfs_parquet_group0.groovy            |   7 ++--
 .../external_table_p0/tvf/test_hdfs_tvf.groovy     |   8 +++++
 6 files changed, 62 insertions(+), 12 deletions(-)

diff --git a/be/src/util/block_compression.cpp 
b/be/src/util/block_compression.cpp
index ae672068119..d13c0c091b9 100644
--- a/be/src/util/block_compression.cpp
+++ b/be/src/util/block_compression.cpp
@@ -28,24 +28,23 @@
         defined(__i386) || defined(_M_IX86)
 #include <libdeflate.h>
 #endif
+#include <brotli/decode.h>
 #include <glog/log_severity.h>
 #include <glog/logging.h>
-#include <limits.h>
 #include <lz4/lz4.h>
 #include <lz4/lz4frame.h>
 #include <lz4/lz4hc.h>
 #include <snappy/snappy-sinksource.h>
 #include <snappy/snappy.h>
-#include <stdint.h>
 #include <zconf.h>
 #include <zlib.h>
 #include <zstd.h>
 #include <zstd_errors.h>
 
 #include <algorithm>
+#include <cstdint>
 #include <limits>
 #include <mutex>
-#include <new>
 #include <ostream>
 
 #include "common/config.h"
@@ -53,9 +52,7 @@
 #include "exec/decompressor.h"
 #include "gutil/endian.h"
 #include "gutil/strings/substitute.h"
-#include "orc/OrcFile.hh"
 #include "runtime/thread_context.h"
-#include "util/bit_util.h"
 #include "util/defer_op.h"
 #include "util/faststring.h"
 
@@ -74,8 +71,6 @@ uint64_t lzoDecompress(const char* inputAddress, const char* 
inputLimit, char* o
 
 namespace doris {
 
-using strings::Substitute;
-
 // exception safe
 Status BlockCompressionCodec::compress(const std::vector<Slice>& inputs, 
size_t uncompressed_size,
                                        faststring* output) {
@@ -1492,6 +1487,31 @@ public:
     }
 };
 
+class BrotliBlockCompression final : public BlockCompressionCodec { // Brotli codec: only decompress() does real work; compress() always fails.
+public:
+    static BrotliBlockCompression* instance() {
+        static BrotliBlockCompression s_instance;
+        return &s_instance;
+    }
+    // Brotli compression is not implemented: always returns InvalidArgument and ignores both arguments.
+    Status compress(const Slice& input, faststring* output) override {
+        return Status::InvalidArgument("not impl brotli compress.");
+    }
+    // Always returns 0 regardless of `len` (compress() above is not implemented).
+    size_t max_compressed_len(size_t len) override { return 0; };
+    // One-shot decompress of `input` into the caller-provided `output` buffer. `output->size` is passed by pointer: per the Brotli API it is the buffer capacity on entry and the decoded length on return. Returns InternalError on any non-success result.
+    Status decompress(const Slice& input, Slice* output) override {
+        // The size of output buffer is always equal to the uncompressed 
length.
+        BrotliDecoderResult result = BrotliDecoderDecompress(
+                input.get_size(), reinterpret_cast<const 
uint8_t*>(input.get_data()), &output->size,
+                reinterpret_cast<uint8_t*>(output->data));
+        if (result != BROTLI_DECODER_RESULT_SUCCESS) {
+            return Status::InternalError("Brotli decompression failed, 
result={}", result);
+        }
+        return Status::OK();
+    }
+};
+
 Status get_block_compression_codec(segment_v2::CompressionTypePB type,
                                    BlockCompressionCodec** codec) {
     switch (type) {
@@ -1582,6 +1602,9 @@ Status 
get_block_compression_codec(tparquet::CompressionCodec::type parquet_code
     case tparquet::CompressionCodec::LZO:
         *codec = LzoBlockCompression::instance();
         break;
+    case tparquet::CompressionCodec::BROTLI:
+        *codec = BrotliBlockCompression::instance();
+        break;
     default:
         return Status::InternalError("unknown compression type({})", 
parquet_codec);
     }
diff --git 
a/docker/thirdparties/docker-compose/hive/scripts/preinstalled_data/hdfs_tvf/test_parquet.brotli.parquet
 
b/docker/thirdparties/docker-compose/hive/scripts/preinstalled_data/hdfs_tvf/test_parquet.brotli.parquet
new file mode 100644
index 00000000000..be60868a398
Binary files /dev/null and 
b/docker/thirdparties/docker-compose/hive/scripts/preinstalled_data/hdfs_tvf/test_parquet.brotli.parquet
 differ
diff --git 
a/regression-test/data/external_table_p0/tvf/test_hdfs_parquet_group0.out 
b/regression-test/data/external_table_p0/tvf/test_hdfs_parquet_group0.out
index 6b58c1478b1..7ed43af1f35 100644
Binary files 
a/regression-test/data/external_table_p0/tvf/test_hdfs_parquet_group0.out and 
b/regression-test/data/external_table_p0/tvf/test_hdfs_parquet_group0.out differ
diff --git a/regression-test/data/external_table_p0/tvf/test_hdfs_tvf.out 
b/regression-test/data/external_table_p0/tvf/test_hdfs_tvf.out
index e850e38a237..a8f5dcf5396 100644
--- a/regression-test/data/external_table_p0/tvf/test_hdfs_tvf.out
+++ b/regression-test/data/external_table_p0/tvf/test_hdfs_tvf.out
@@ -221,6 +221,28 @@
 19     Supplier#000000019      edZT3es,nBFD8lBXTGeTl   24      34-278-310-2731 
6150.38 refully final foxes across the dogged theodolites sleep slyly abou
 20     Supplier#000000020      iybAE,RmTymrZVYaFZva2SH,j       3       
13-715-945-6730 530.82  n, ironic ideas would nag blithely about the slyly 
regular accounts. silent, expr
 
+-- !parquet_brotli --
+1      Supplier#000000001       N kD4on9OM Ipw3,gf0JBoQDd7tgrzrddZ     17      
27-918-335-1736 5755.94 each slyly above the careful
+2      Supplier#000000002      89eJ5ksX3ImxJQBvxObC,   5       15-679-861-2259 
4032.68  slyly bold instructions. idle dependen
+3      Supplier#000000003      q1,G3Pj6OjIuUYfUoH18BFTKP5aU9bEV3       1       
11-383-516-1199 4192.40 blithely silent requests after the express dependencies 
are sl
+4      Supplier#000000004      Bk7ah4CK8SYQTepEmvMkkgMwg       15      
25-843-787-7479 4641.08 riously even requests above the exp
+5      Supplier#000000005      Gcdm2rJRzl5qlTVzc       11      21-151-690-3663 
-283.84 . slyly regular pinto bea
+6      Supplier#000000006      tQxuVm7s7CnK    14      24-696-997-4969 1365.79 
final accounts. regular dolphins use against the furiously ironic decoys. 
+7      Supplier#000000007      s,4TicNGB4uO6PaSqNBUq   23      33-990-965-2201 
6820.35 s unwind silently furiously regular courts. final requests are 
deposits. requests wake quietly blit
+8      Supplier#000000008      9Sq4bBH2FQEmaFOocY45sRTxo6yuoG  17      
27-498-742-3860 7627.85 al pinto beans. asymptotes haggl
+9      Supplier#000000009      1KhUgZegwM3ua7dsYmekYBsK        10      
20-403-398-8662 5302.37 s. unusual, even requests along the furiously regular 
pac
+10     Supplier#000000010      Saygah3gYWMp72i PY      24      34-852-489-8585 
3891.91 ing waters. regular requests ar
+11     Supplier#000000011      JfwTs,LZrV, M,9C        18      28-613-996-1505 
3393.08 y ironic packages. slyly ironic accounts affix furiously; ironically 
unusual excuses across the flu
+12     Supplier#000000012      aLIW  q0HYd     8       18-179-925-7181 1432.69 
al packages nag alongside of the bold instructions. express, daring accounts
+13     Supplier#000000013      HK71HQyWoqRWOX8GI FpgAifW,2PoH  3       
13-727-620-7813 9107.22 requests engage regularly instructions. furiously 
special requests ar
+14     Supplier#000000014      EXsnO5pTNj4iZRm 15      25-656-247-5058 9189.82 
l accounts boost. fluffily bold warhorses wake
+15     Supplier#000000015      olXVbNBfVzRqgokr1T,Ie   8       18-453-357-6394 
308.56   across the furiously regular platelets wake even deposits. quickly 
express she
+16     Supplier#000000016      YjP5C55zHDXL7LalK27zfQnwejdpin4AMpvh    22      
32-822-502-4215 2972.26 ously express ideas haggle quickly dugouts? fu
+17     Supplier#000000017      c2d,ESHRSkK3WYnxpgw6aOqN0q      19      
29-601-884-9219 1687.81 eep against the furiously bold ideas. fluffily bold 
packa
+18     Supplier#000000018      PGGVE5PWAMwKDZw         16      26-729-551-1115 
7040.82 accounts snooze slyly furiously bold 
+19     Supplier#000000019      edZT3es,nBFD8lBXTGeTl   24      34-278-310-2731 
6150.38 refully final foxes across the dogged theodolites sleep slyly abou
+20     Supplier#000000020      iybAE,RmTymrZVYaFZva2SH,j       3       
13-715-945-6730 530.82  n, ironic ideas would nag blithely about the slyly 
regular accounts. silent, expr
+
 -- !parquet_decimal256 --
 1      
99999999999999999999999999999999999999.99999999999999999999999999999999999999
 2      
-99999999999999999999999999999999999999.99999999999999999999999999999999999999
diff --git 
a/regression-test/suites/external_table_p0/tvf/test_hdfs_parquet_group0.groovy 
b/regression-test/suites/external_table_p0/tvf/test_hdfs_parquet_group0.groovy
index 65d6732e272..47fc8574a34 100644
--- 
a/regression-test/suites/external_table_p0/tvf/test_hdfs_parquet_group0.groovy
+++ 
b/regression-test/suites/external_table_p0/tvf/test_hdfs_parquet_group0.groovy
@@ -104,13 +104,10 @@ 
suite("test_hdfs_parquet_group0","external,hive,tvf,external_docker") {
 
 
             uri = "${defaultFS}" + 
"/user/doris/tvf_data/test_hdfs_parquet/group0/large_string_map.brotli.parquet"
-            test {
-                sql """ select * from HDFS(
+            order_qt_test_11 """ select count(arr) from HDFS(
                         "uri" = "${uri}",
                         "hadoop.username" = "${hdfsUserName}",
-                        "format" = "parquet") limit 10; """
-                exception "unknown compression type(4)"
-            }
+                        "format" = "parquet"); """
 
 
             uri = "${defaultFS}" + 
"/user/doris/tvf_data/test_hdfs_parquet/group0/non_hadoop_lz4_compressed.parquet"
diff --git a/regression-test/suites/external_table_p0/tvf/test_hdfs_tvf.groovy 
b/regression-test/suites/external_table_p0/tvf/test_hdfs_tvf.groovy
index 02bda4ec0dd..74cb1e320aa 100644
--- a/regression-test/suites/external_table_p0/tvf/test_hdfs_tvf.groovy
+++ b/regression-test/suites/external_table_p0/tvf/test_hdfs_tvf.groovy
@@ -107,6 +107,14 @@ suite("test_hdfs_tvf","external,hive,tvf,external_docker") 
{
                             "uri" = "${uri}",
                             "hadoop.username" = "${hdfsUserName}",
                             "format" = "${format}") order by s_suppkey limit 
20; """
+            
+            // test parquet brotli
+            uri = "${defaultFS}" + 
"/user/doris/preinstalled_data/hdfs_tvf/test_parquet.brotli.parquet"
+            format = "parquet"
+            qt_parquet_brotli """ select * from HDFS(
+                            "uri" = "${uri}",
+                            "hadoop.username" = "${hdfsUserName}",
+                            "format" = "${format}") order by s_suppkey limit 
20; """
 
             // test parquet decimal256
             uri = "${defaultFS}" + 
"/user/doris/preinstalled_data/hdfs_tvf/test_parquet_decimal256.parquet"


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org
For additional commands, e-mail: commits-h...@doris.apache.org

Reply via email to