This is an automated email from the ASF dual-hosted git repository.
morningman pushed a commit to branch branch-3.1
in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/branch-3.1 by this push:
new cb07d5004db branch-3.1: [fix](outfile) fix small file output with bz2
compression #56368 (#57041)
cb07d5004db is described below
commit cb07d5004dba902722a703da4000ef9ff5edb753
Author: Mingyu Chen (Rayner) <[email protected]>
AuthorDate: Thu Oct 16 23:33:26 2025 +0800
branch-3.1: [fix](outfile) fix small file output with bz2 compression
#56368 (#57041)
bp #56368
---
be/src/util/block_compression.cpp | 5 +-
.../data/export_p0/test_outfile_csv_compress.out | 60 ++++++++++++++++++++++
.../export_p0/test_outfile_csv_compress.groovy | 47 +++++++++++++++++
.../hive/test_hive_get_schema_from_table.groovy | 2 +-
4 files changed, 112 insertions(+), 2 deletions(-)
diff --git a/be/src/util/block_compression.cpp
b/be/src/util/block_compression.cpp
index d1788b0948a..04ea339a09f 100644
--- a/be/src/util/block_compression.cpp
+++ b/be/src/util/block_compression.cpp
@@ -981,7 +981,10 @@ public:
size_t max_compressed_len(size_t len) override {
// TODO: make sure the max_compressed_len for bzip2
- return len * 2;
+ // 50 is an estimated fixed overhead for bzip2,
+ // in case the input len is small and BZ2_bzBuffToBuffCompress will return
+ // BZ_OUTBUFF_FULL
+ return len * 2 + 50;
}
};
diff --git a/regression-test/data/export_p0/test_outfile_csv_compress.out
b/regression-test/data/export_p0/test_outfile_csv_compress.out
index 48ae4946778..7d3965e8974 100644
--- a/regression-test/data/export_p0/test_outfile_csv_compress.out
+++ b/regression-test/data/export_p0/test_outfile_csv_compress.out
@@ -113,6 +113,66 @@ c2 text Yes false \N NONE
c1 text Yes false \N NONE
c2 text Yes false \N NONE
+-- !select --
+1 2
+
+-- !select --
+1 1
+
+-- !select --
+c1 text Yes false \N NONE
+c2 text Yes false \N NONE
+
+-- !select --
+1 2
+
+-- !select --
+1 1
+
+-- !select --
+c1 text Yes false \N NONE
+c2 text Yes false \N NONE
+
+-- !select --
+1 2
+
+-- !select --
+1 1
+
+-- !select --
+c1 text Yes false \N NONE
+c2 text Yes false \N NONE
+
+-- !select --
+1 2
+
+-- !select --
+1 1
+
+-- !select --
+c1 text Yes false \N NONE
+c2 text Yes false \N NONE
+
+-- !select --
+1 2
+
+-- !select --
+1 1
+
+-- !select --
+c1 text Yes false \N NONE
+c2 text Yes false \N NONE
+
+-- !select --
+1 2
+
+-- !select --
+1 1
+
+-- !select --
+c1 text Yes false \N NONE
+c2 text Yes false \N NONE
+
-- !select --
__dummy_col text Yes false \N NONE
diff --git a/regression-test/suites/export_p0/test_outfile_csv_compress.groovy
b/regression-test/suites/export_p0/test_outfile_csv_compress.groovy
index 6bdbb39fe75..01e5f066440 100644
--- a/regression-test/suites/export_p0/test_outfile_csv_compress.groovy
+++ b/regression-test/suites/export_p0/test_outfile_csv_compress.groovy
@@ -39,6 +39,17 @@ suite("test_outfile_csv_compress", "p0") {
for (int i = 0; i < 20; i++) {
sql """ insert into ${table_name} select id + ${i}, concat(name,
id + ${i}) from ${table_name};"""
}
+
+ // small table
+ sql """ DROP TABLE IF EXISTS small_${table_name} """
+ sql """
+ CREATE TABLE IF NOT EXISTS small_${table_name} (
+ `id` int,
+ `name` int
+ )
+ DISTRIBUTED BY HASH(name) PROPERTIES("replication_num" = "1");
+ """
+ sql """INSERT INTO small_${table_name} values(1, 2);"""
}
def table_name = "test_outfile_csv_compress"
@@ -96,6 +107,42 @@ suite("test_outfile_csv_compress", "p0") {
"""
}
+ for (String compression_type: ["plain", "gz", "bz2", "snappyblock",
"lz4block", "zstd"]) {
+ def small = "small_${table_name}"
+ def outfile_url = csv_outfile_result(small, compression_type);
+ print("http://${bucket}.${s3_endpoint}${outfile_url.substring(5 +
bucket.length(), outfile_url.length() - 1)}0.")
+ qt_select """ select c1, c2 from s3(
+ "uri" =
"http://${bucket}.${s3_endpoint}${outfile_url.substring(5 + bucket.length(),
outfile_url.length() - 1)}*",
+ "ACCESS_KEY"= "${ak}",
+ "SECRET_KEY" = "${sk}",
+ "format" = "csv",
+ "provider" = "${getS3Provider()}",
+ "region" = "${region}",
+ "compress_type" = "${compression_type}"
+ ) order by c1, c2 limit 10;
+ """
+ qt_select """ select count(c1), count(c2) from s3(
+ "uri" =
"http://${bucket}.${s3_endpoint}${outfile_url.substring(5 + bucket.length(),
outfile_url.length() - 1)}*",
+ "ACCESS_KEY"= "${ak}",
+ "SECRET_KEY" = "${sk}",
+ "format" = "csv",
+ "provider" = "${getS3Provider()}",
+ "region" = "${region}",
+ "compress_type" = "${compression_type}"
+ );
+ """
+ qt_select """desc function s3(
+ "uri" =
"http://${bucket}.${s3_endpoint}${outfile_url.substring(5 + bucket.length(),
outfile_url.length() - 1)}*",
+ "ACCESS_KEY"= "${ak}",
+ "SECRET_KEY" = "${sk}",
+ "format" = "csv",
+ "provider" = "${getS3Provider()}",
+ "region" = "${region}",
+ "compress_type" = "${compression_type}"
+ );
+ """
+ }
+
// test invalid compression_type
test {
sql """
diff --git
a/regression-test/suites/external_table_p0/hive/test_hive_get_schema_from_table.groovy
b/regression-test/suites/external_table_p0/hive/test_hive_get_schema_from_table.groovy
index 5d6f78b6ae0..3f72f8ef974 100644
---
a/regression-test/suites/external_table_p0/hive/test_hive_get_schema_from_table.groovy
+++
b/regression-test/suites/external_table_p0/hive/test_hive_get_schema_from_table.groovy
@@ -47,7 +47,7 @@ suite("test_hive_get_schema_from_table",
"external_docker,hive,external_docker_h
log.info("database = ${res_dbs_log[i][0]} => tables = " +
tbs.toString())
}
- order_qt_schema_1 """select * from
${catalog_name}.${ex_db_name}.parquet_partition_table order by
l_orderkey,l_partkey limit 1;"""
+ order_qt_schema_1 """select * from
${catalog_name}.${ex_db_name}.parquet_partition_table order by l_orderkey,
l_partkey limit 1;"""
order_qt_schema_2 """select * from
${catalog_name}.${ex_db_name}.parquet_delta_binary_packed order by int_value
limit 1;"""
order_qt_schema_3 """select * from
${catalog_name}.${ex_db_name}.parquet_alltypes_tiny_pages order by id desc
limit 5;"""
order_qt_schema_4 """select * from
${catalog_name}.${ex_db_name}.orc_all_types_partition order by bigint_col desc
limit 3;"""
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]