This is an automated email from the ASF dual-hosted git repository. morningman pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/master by this push: new 0838ff4bf4 [fix](Outfile) fix bug that the `fileSize` is not correct when outfile is completed (#22951) 0838ff4bf4 is described below commit 0838ff4bf45cf5ec58b5cd10f0457a6b0935c57d Author: Tiewei Fang <43782773+bepppo...@users.noreply.github.com> AuthorDate: Fri Aug 18 22:31:44 2023 +0800 [fix](Outfile) fix bug that the `fileSize` is not correct when outfile is completed (#22951) --- be/src/vec/runtime/vfile_result_writer.cpp | 6 +- .../Data-Manipulation-Statements/OUTFILE.md | 2 +- .../Data-Manipulation-Statements/OUTFILE.md | 2 +- .../test_outfile_orc_max_file_size.groovy | 115 +++++++++++++++++++++ 4 files changed, 122 insertions(+), 3 deletions(-) diff --git a/be/src/vec/runtime/vfile_result_writer.cpp b/be/src/vec/runtime/vfile_result_writer.cpp index 8977cd0c47..1d5ed6ce1b 100644 --- a/be/src/vec/runtime/vfile_result_writer.cpp +++ b/be/src/vec/runtime/vfile_result_writer.cpp @@ -467,7 +467,11 @@ Status VFileResultWriter::_create_new_file_if_exceed_size() { Status VFileResultWriter::_close_file_writer(bool done) { if (_vfile_writer) { _vfile_writer->close(); - COUNTER_UPDATE(_written_data_bytes, _current_written_bytes); + // we cannot use _current_written_bytes in COUNTER_UPDATE(_written_data_bytes, _current_written_bytes) + // because the `write()` function of the orc/parquet writer is called inside `_vfile_writer->close()` + // and the real written_len will increase, + // so _current_written_bytes will be less than _vfile_writer->written_len() + COUNTER_UPDATE(_written_data_bytes, _vfile_writer->written_len()); _vfile_writer.reset(nullptr); } else if (_file_writer_impl) { _file_writer_impl->close(); diff --git a/docs/en/docs/sql-manual/sql-reference/Data-Manipulation-Statements/OUTFILE.md b/docs/en/docs/sql-manual/sql-reference/Data-Manipulation-Statements/OUTFILE.md index 0b87f9bc9c..48bf78c945 100644 --- a/docs/en/docs/sql-manual/sql-reference/Data-Manipulation-Statements/OUTFILE.md +++
b/docs/en/docs/sql-manual/sql-reference/Data-Manipulation-Statements/OUTFILE.md @@ -77,7 +77,7 @@ illustrate: File related properties column_separator: column separator,is only for CSV format <version since="1.2.0">Support mulit-bytes, such as: "\\x01", "abc"</version> line_delimiter: line delimiter,is only for CSV format <version since="1.2.0">Support mulit-bytes, such as: "\\x01", "abc"</version> - max_file_size: the size limit of a single file, if the result exceeds this value, it will be cut into multiple files. + max_file_size: the size limit of a single file; if the result exceeds this value, it will be cut into multiple files. The value range of max_file_size is [5MB, 2GB] and the default is 1GB. (When the specified file format is ORC, the actual file-split size will be a multiple of 64MB; for example: if max_file_size = 5MB is specified, files are actually split at 64MB; if max_file_size = 65MB is specified, files are actually split at 128MB.) delete_existing_files: default `false`.
If it is specified as true, you will first delete all files specified in the directory specified by the file_path, and then export the data to the directory.For example: "file_path" = "/user/tmp", then delete all files and directory under "/user/"; "file_path" = "/user/tmp/", then delete all files and directory under "/user/tmp/" Broker related properties need to be prefixed with `broker.`: diff --git a/docs/zh-CN/docs/sql-manual/sql-reference/Data-Manipulation-Statements/OUTFILE.md b/docs/zh-CN/docs/sql-manual/sql-reference/Data-Manipulation-Statements/OUTFILE.md index ff650047b2..6e72184c47 100644 --- a/docs/zh-CN/docs/sql-manual/sql-reference/Data-Manipulation-Statements/OUTFILE.md +++ b/docs/zh-CN/docs/sql-manual/sql-reference/Data-Manipulation-Statements/OUTFILE.md @@ -82,7 +82,7 @@ INTO OUTFILE "file_path" 文件相关的属性 column_separator: 列分隔符,只支持csv格式。<version since="1.2.0">支持多字节分隔符,如:"\\x01", "abc"</version> line_delimiter: 行分隔符,只支持csv格式。<version since="1.2.0">支持多字节分隔符,如:"\\x01", "abc"</version> - max_file_size: 单个文件大小限制,如果结果超过这个值,将切割成多个文件。 + max_file_size: 单个文件大小限制,如果结果超过这个值,将切割成多个文件, max_file_size取值范围是[5MB, 2GB], 默认为1GB。(当指定导出为orc文件格式时,实际切分文件的大小将是64MB的倍数,如:指定max_file_size = 5MB, 实际将以64MB为切分;指定max_file_size = 65MB, 实际将以128MB为切分) delete_existing_files: 默认为false,若指定为true,则会先删除file_path指定的目录下的所有文件,然后导出数据到该目录下。例如:"file_path" = "/user/tmp", 则会删除"/user/"下所有文件及目录;"file_path" = "/user/tmp/", 则会删除"/user/tmp/"下所有文件及目录 Broker 相关属性需加前缀 `broker.`: diff --git a/regression-test/suites/export_p2/test_outfile_orc_max_file_size.groovy b/regression-test/suites/export_p2/test_outfile_orc_max_file_size.groovy new file mode 100644 index 0000000000..71c5980d1b --- /dev/null +++ b/regression-test/suites/export_p2/test_outfile_orc_max_file_size.groovy @@ -0,0 +1,115 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +suite("test_outfile_orc_max_file_size", "p2") { + String nameNodeHost = context.config.otherConfigs.get("extHiveHmsHost") + String hdfsPort = context.config.otherConfigs.get("extHdfsPort") + String fs = "hdfs://${nameNodeHost}:${hdfsPort}" + String user_name = context.config.otherConfigs.get("extHiveHmsUser") + + // the path used to load data + def load_data_path = "/user/export_test/test_orc_max_file_size.orc" + // the path used to export data + def outFilePath = """/user/export_test/test_max_file_size/test_orc/exp_""" + + def create_table = {table_name -> + sql """ DROP TABLE IF EXISTS ${table_name} """ + sql """ + CREATE TABLE IF NOT EXISTS ${table_name} ( + `user_id` LARGEINT NOT NULL COMMENT "用户id", + `date` DATE NOT NULL COMMENT "数据灌入日期时间", + `datetime` DATETIME NOT NULL COMMENT "数据灌入日期时间", + `city` VARCHAR(20) COMMENT "用户所在城市", + `age` INT COMMENT "用户年龄", + `sex` INT COMMENT "用户性别", + `bool_col` boolean COMMENT "", + `int_col` int COMMENT "", + `bigint_col` bigint COMMENT "", + `largeint_col` largeint COMMENT "", + `float_col` float COMMENT "", + `double_col` double COMMENT "", + `char_col` CHAR(10) COMMENT "", + `decimal_col` decimal COMMENT "" + ) + DISTRIBUTED BY HASH(user_id) PROPERTIES("replication_num" = "1"); + """ + } + + def table_export_name = "test_export_max_file_size" + + create_table(table_export_name) + + // load data + sql """ + insert into 
${table_export_name} + select * from hdfs( + "uri" = "hdfs://${nameNodeHost}:${hdfsPort}${load_data_path}", + "fs.defaultFS" = "${fs}", + "hadoop.username" = "${user_name}", + "format" = "orc"); + """ + + def test_outfile_orc_success = {maxFileSize, isDelete, fileNumber, totalRows -> + table = sql """ + select * from ${table_export_name} + into outfile "${fs}${outFilePath}" + FORMAT AS ORC + PROPERTIES( + "fs.defaultFS"="${fs}", + "hadoop.username" = "${user_name}", + "max_file_size" = "${maxFileSize}", + "delete_existing_files"="${isDelete}" + ); + """ + assertTrue(table.size() == 1) + assertTrue(table[0].size == 4) + log.info("outfile result = " + table[0]) + assertEquals(table[0][0], fileNumber) + assertEquals(table[0][1], totalRows) + } + + def test_outfile_orc_fail = {maxFileSize, isDelete -> + test { + sql """ + select * from ${table_export_name} + into outfile "${fs}${outFilePath}" + FORMAT AS ORC + PROPERTIES( + "fs.defaultFS"="${fs}", + "hadoop.username" = "${user_name}", + "max_file_size" = "${maxFileSize}", + "delete_existing_files"="${isDelete}" + ); + """ + + // other check will not work because already declared a check callback + exception "max file size should between 5MB and 2GB" + + // callback + check { result, exception, startTime, endTime -> + assertTrue(exception != null) + } + } + } + + test_outfile_orc_fail('3MB', true) + test_outfile_orc_fail('2.1GB', true) + test_outfile_orc_success('5MB', true, 3, 2000000) + test_outfile_orc_success('63MB', true, 3, 2000000) + test_outfile_orc_success('64MB', true, 3, 2000000) + test_outfile_orc_success('80MB', true, 2, 2000000) +} --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org For additional commands, e-mail: commits-h...@doris.apache.org