This is an automated email from the ASF dual-hosted git repository.

morningman pushed a commit to branch branch-2.1
in repository https://gitbox.apache.org/repos/asf/doris.git


The following commit(s) were added to refs/heads/branch-2.1 by this push:
     new c84b56140c2 [Fix](outfile) Add a configuration for exporting data in Parquet format using `select into outfile` (#36143)
c84b56140c2 is described below

commit c84b56140c2d18a329d8885910d53bac64e9df47
Author: Tiewei Fang <43782773+bepppo...@users.noreply.github.com>
AuthorDate: Thu Jun 13 11:49:46 2024 +0800

    [Fix](outfile) Add a configuration for exporting data in Parquet format using `select into outfile` (#36143)
    
    backport: #36142
---
 be/src/common/config.cpp                    | 3 +++
 be/src/common/config.h                      | 3 +++
 be/src/vec/runtime/vparquet_transformer.cpp | 6 +++---
 3 files changed, 9 insertions(+), 3 deletions(-)
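
Editor's note (not part of the commit): before this patch the Parquet writer
flushed a row group once roughly 128MB had been buffered, using a hardcoded
constant. The patch turns that threshold into the BE config min_row_group_size
and also raises Arrow's own max_row_group_length to INT64_MAX, so that only the
Doris-side byte threshold decides where row groups end. A minimal standalone
sketch of the writer-properties part, assuming the Arrow Parquet C++ headers
are available (make_writer_properties is an illustrative name, not a Doris
function), could look like:

    #include <cstdint>
    #include <limits>
    #include <memory>
    #include <parquet/properties.h>

    // Sketch only: build WriterProperties with row-count-based splitting
    // effectively disabled, so a byte threshold alone closes row groups.
    std::shared_ptr<parquet::WriterProperties> make_writer_properties() {
        parquet::WriterProperties::Builder builder;
        builder.max_row_group_length(std::numeric_limits<int64_t>::max());
        return builder.build();
    }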

diff --git a/be/src/common/config.cpp b/be/src/common/config.cpp
index d090b3a451a..e6e5e85f96a 100644
--- a/be/src/common/config.cpp
+++ b/be/src/common/config.cpp
@@ -1230,6 +1230,9 @@ DEFINE_mBool(skip_loading_stale_rowset_meta, "false");
 
 DEFINE_Bool(enable_file_logger, "true");
 
+// The minimum row group size when exporting Parquet files. default 128MB
+DEFINE_Int64(min_row_group_size, "134217728");
+
 // The time out milliseconds for remote fetch schema RPC, default 60s
 DEFINE_mInt64(fetch_remote_schema_rpc_timeout_ms, "60000");
 
diff --git a/be/src/common/config.h b/be/src/common/config.h
index 61e0e2673b7..865d23000f5 100644
--- a/be/src/common/config.h
+++ b/be/src/common/config.h
@@ -1316,6 +1316,9 @@ DECLARE_Bool(enable_file_logger);
 // The time out milliseconds for remote fetch schema RPC
 DECLARE_mInt64(fetch_remote_schema_rpc_timeout_ms);
 
+// The minimum row group size when exporting Parquet files.
+DECLARE_Int64(min_row_group_size);
+
 #ifdef BE_TEST
 // test s3
 DECLARE_String(test_s3_resource);
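
Editor's note: the DECLARE_Int64/DEFINE_Int64 pair above is the usual Doris BE
pattern for exposing a knob from common/config.h and common/config.cpp. As a
simplified analogue rather than the actual macro expansion, it roughly amounts
to an extern variable in namespace doris::config whose compiled-in default can
be overridden from be.conf at startup:

    #include <cstdint>

    // Simplified analogue only; assumption: the real macros also register
    // the knob with the config framework so be.conf can override it.
    namespace doris::config {
    extern int64_t min_row_group_size;      // roughly what DECLARE_Int64 provides (config.h)
    int64_t min_row_group_size = 134217728; // roughly what DEFINE_Int64 provides (config.cpp), 128MB
    } // namespace doris::config
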
diff --git a/be/src/vec/runtime/vparquet_transformer.cpp b/be/src/vec/runtime/vparquet_transformer.cpp
index 77068adeebe..fbb24e00968 100644
--- a/be/src/vec/runtime/vparquet_transformer.cpp
+++ b/be/src/vec/runtime/vparquet_transformer.cpp
@@ -35,6 +35,7 @@
 #include <ostream>
 #include <string>
 
+#include "common/config.h"
 #include "common/status.h"
 #include "gutil/endian.h"
 #include "io/fs/file_writer.h"
@@ -70,8 +71,6 @@
 
 namespace doris::vectorized {
 
-const uint64_t min_row_group_size = 128 * 1024 * 1024; // 128MB
-
 ParquetOutputStream::ParquetOutputStream(doris::io::FileWriter* file_writer)
         : _file_writer(file_writer), _cur_pos(0), _written_len(0) {
     set_mode(arrow::io::FileMode::WRITE);
@@ -247,6 +246,7 @@ Status VParquetTransformer::_parse_properties() {
         }
         builder.created_by(
                 fmt::format("{}({})", doris::get_short_version(), parquet::DEFAULT_CREATED_BY));
+        builder.max_row_group_length(std::numeric_limits<int64_t>::max());
         _parquet_writer_properties = builder.build();
         _arrow_properties = parquet::ArrowWriterProperties::Builder()
                                     .enable_deprecated_int96_timestamps()
@@ -296,7 +296,7 @@ Status VParquetTransformer::write(const Block& block) {
 
     RETURN_DORIS_STATUS_IF_ERROR(_writer->WriteRecordBatch(*result));
     _write_size += block.bytes();
-    if (_write_size >= min_row_group_size) {
+    if (_write_size >= doris::config::min_row_group_size) {
         RETURN_DORIS_STATUS_IF_ERROR(_writer->NewBufferedRowGroup());
         _write_size = 0;
     }
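
Editor's note on the write() change above, as a hedged standalone sketch
(RowGroupSizer and on_block_written are illustrative names, not Doris APIs):
the transformer keeps a running byte count of the blocks it has written and
signals a flush, i.e. _writer->NewBufferedRowGroup() in the real code, once
that count reaches min_row_group_size.

    #include <cstdint>

    // Illustrative flush-by-size pattern; threshold plays the role of
    // doris::config::min_row_group_size.
    struct RowGroupSizer {
        int64_t threshold;
        int64_t buffered_bytes = 0;

        // Returns true when the caller should close the current row group.
        bool on_block_written(int64_t block_bytes) {
            buffered_bytes += block_bytes;
            if (buffered_bytes >= threshold) {
                buffered_bytes = 0;
                return true;
            }
            return false;
        }
    };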

