This is an automated email from the ASF dual-hosted git repository.

morningman pushed a commit to branch branch-2.1
in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/branch-2.1 by this push:
     new c84b56140c2 [Fix](outfile) Add a configuration for exporting data in Parquet format using `select into outfile` (#36143)
c84b56140c2 is described below

commit c84b56140c2d18a329d8885910d53bac64e9df47
Author: Tiewei Fang <43782773+bepppo...@users.noreply.github.com>
AuthorDate: Thu Jun 13 11:49:46 2024 +0800

    [Fix](outfile) Add a configuration for exporting data in Parquet format using `select into outfile` (#36143)

    backport: #36142
---
 be/src/common/config.cpp                    | 3 +++
 be/src/common/config.h                      | 3 +++
 be/src/vec/runtime/vparquet_transformer.cpp | 6 +++---
 3 files changed, 9 insertions(+), 3 deletions(-)

diff --git a/be/src/common/config.cpp b/be/src/common/config.cpp
index d090b3a451a..e6e5e85f96a 100644
--- a/be/src/common/config.cpp
+++ b/be/src/common/config.cpp
@@ -1230,6 +1230,9 @@ DEFINE_mBool(skip_loading_stale_rowset_meta, "false");
 
 DEFINE_Bool(enable_file_logger, "true");
 
+// The minimum row group size when exporting Parquet files. default 128MB
+DEFINE_Int64(min_row_group_size, "134217728");
+
 // The time out milliseconds for remote fetch schema RPC, default 60s
 DEFINE_mInt64(fetch_remote_schema_rpc_timeout_ms, "60000");
 
diff --git a/be/src/common/config.h b/be/src/common/config.h
index 61e0e2673b7..865d23000f5 100644
--- a/be/src/common/config.h
+++ b/be/src/common/config.h
@@ -1316,6 +1316,9 @@ DECLARE_Bool(enable_file_logger);
 // The time out milliseconds for remote fetch schema RPC
 DECLARE_mInt64(fetch_remote_schema_rpc_timeout_ms);
 
+// The minimum row group size when exporting Parquet files. 
+DECLARE_Int64(min_row_group_size);
+
 #ifdef BE_TEST
 // test s3
 DECLARE_String(test_s3_resource);
diff --git a/be/src/vec/runtime/vparquet_transformer.cpp b/be/src/vec/runtime/vparquet_transformer.cpp
index 77068adeebe..fbb24e00968 100644
--- a/be/src/vec/runtime/vparquet_transformer.cpp
+++ b/be/src/vec/runtime/vparquet_transformer.cpp
@@ -35,6 +35,7 @@
 #include <ostream>
 #include <string>
 
+#include "common/config.h"
 #include "common/status.h"
 #include "gutil/endian.h"
 #include "io/fs/file_writer.h"
@@ -70,8 +71,6 @@
 
 namespace doris::vectorized {
 
-const uint64_t min_row_group_size = 128 * 1024 * 1024; // 128MB
-
 ParquetOutputStream::ParquetOutputStream(doris::io::FileWriter* file_writer)
         : _file_writer(file_writer), _cur_pos(0), _written_len(0) {
     set_mode(arrow::io::FileMode::WRITE);
@@ -247,6 +246,7 @@ Status VParquetTransformer::_parse_properties() {
         }
         builder.created_by(
                 fmt::format("{}({})", doris::get_short_version(), parquet::DEFAULT_CREATED_BY));
+        builder.max_row_group_length(std::numeric_limits<int64_t>::max());
         _parquet_writer_properties = builder.build();
         _arrow_properties = parquet::ArrowWriterProperties::Builder()
                                     .enable_deprecated_int96_timestamps()
@@ -296,7 +296,7 @@ Status VParquetTransformer::write(const Block& block) {
 
     RETURN_DORIS_STATUS_IF_ERROR(_writer->WriteRecordBatch(*result));
     _write_size += block.bytes();
-    if (_write_size >= min_row_group_size) {
+    if (_write_size >= doris::config::min_row_group_size) {
         RETURN_DORIS_STATUS_IF_ERROR(_writer->NewBufferedRowGroup());
         _write_size = 0;
     }


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org
For additional commands, e-mail: commits-h...@doris.apache.org