This is an automated email from the ASF dual-hosted git repository.

kxiao pushed a commit to branch branch-2.0
in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/branch-2.0 by this push:
     new 59e6fdf73b [improve](outfile) add file_suffix options for outfile (#24334) (#24431)

59e6fdf73b is described below

commit 59e6fdf73b35dc3f74281d5e4d4e3ad3dc74004f
Author: wudi <676366...@qq.com>
AuthorDate: Fri Sep 15 14:21:06 2023 +0800

    [improve](outfile) add file_suffix options for outfile (#24334) (#24431)
---
 be/src/vec/runtime/vfile_result_writer.cpp          |  4 +-
 be/src/vec/sink/vresult_sink.h                      |  2 +
 .../Data-Manipulation-Statements/OUTFILE.md         |  1 +
 .../Data-Manipulation-Statements/OUTFILE.md         |  1 +
 .../org/apache/doris/analysis/OutFileClause.java    | 10 +++
 gensrc/thrift/DataSinks.thrift                      |  1 +
 .../data/export_p0/test_outfile_file_suffix.out     |  4 ++
 .../export_p0/test_outfile_file_suffix.groovy       | 72 ++++++++++++++++++++++
 8 files changed, 94 insertions(+), 1 deletion(-)

diff --git a/be/src/vec/runtime/vfile_result_writer.cpp b/be/src/vec/runtime/vfile_result_writer.cpp
index 9d5fc4e158..4a215d7898 100644
--- a/be/src/vec/runtime/vfile_result_writer.cpp
+++ b/be/src/vec/runtime/vfile_result_writer.cpp
@@ -181,9 +181,11 @@ Status VFileResultWriter::_create_file_writer(const std::string& file_name) {
 
 // file name format as: my_prefix_{fragment_instance_id}_0.csv
 Status VFileResultWriter::_get_next_file_name(std::string* file_name) {
+    std::string suffix =
+            _file_opts->file_suffix.empty() ? _file_format_to_name() : _file_opts->file_suffix;
     std::stringstream ss;
     ss << _file_opts->file_path << print_id(_fragment_instance_id) << "_" << (_file_idx++) << "."
-       << _file_format_to_name();
+       << suffix;
     *file_name = ss.str();
     _header_sent = false;
     if (_storage_type == TStorageBackendType::LOCAL) {
diff --git a/be/src/vec/sink/vresult_sink.h b/be/src/vec/sink/vresult_sink.h
index de1126b2e1..19cfb3e3b4 100644
--- a/be/src/vec/sink/vresult_sink.h
+++ b/be/src/vec/sink/vresult_sink.h
@@ -70,6 +70,7 @@ struct ResultFileOptions {
     std::string orc_schema;
 
     bool delete_existing_files = false;
+    std::string file_suffix;
 
     ResultFileOptions(const TResultFileSinkOptions& t_opt) {
         file_path = t_opt.file_path;
@@ -80,6 +81,7 @@ struct ResultFileOptions {
                 t_opt.__isset.max_file_size_bytes ? t_opt.max_file_size_bytes : max_file_size_bytes;
         delete_existing_files =
                 t_opt.__isset.delete_existing_files ? t_opt.delete_existing_files : false;
+        file_suffix = t_opt.file_suffix;
         is_local_file = true;
 
         if (t_opt.__isset.broker_addresses) {
diff --git a/docs/en/docs/sql-manual/sql-reference/Data-Manipulation-Statements/OUTFILE.md b/docs/en/docs/sql-manual/sql-reference/Data-Manipulation-Statements/OUTFILE.md
index 48bf78c945..e42fad7b47 100644
--- a/docs/en/docs/sql-manual/sql-reference/Data-Manipulation-Statements/OUTFILE.md
+++ b/docs/en/docs/sql-manual/sql-reference/Data-Manipulation-Statements/OUTFILE.md
@@ -79,6 +79,7 @@ illustrate:
   line_delimiter: line delimiter, only for the CSV format. <version since="1.2.0">Supports multi-byte delimiters, such as: "\\x01", "abc"</version>
   max_file_size: the size limit of a single file; if the result exceeds this value, it is cut into multiple files. The value range of max_file_size is [5MB, 2GB], and the default is 1GB. (When the file format is ORC, the actual split size is a multiple of 64MB, e.g.: with max_file_size = 5MB the file is actually split at 64MB, and with max_file_size = 65MB at 128MB.)
   delete_existing_files: default `false`. If set to true, all files under the directory specified by file_path are deleted first, and the data is then exported to that directory. For example: "file_path" = "/user/tmp" deletes all files and directories under "/user/"; "file_path" = "/user/tmp/" deletes all files and directories under "/user/tmp/"
+  file_suffix: specify the suffix of the exported file
 
 Broker related properties need to be prefixed with `broker.`:
   broker.name: broker name
diff --git a/docs/zh-CN/docs/sql-manual/sql-reference/Data-Manipulation-Statements/OUTFILE.md b/docs/zh-CN/docs/sql-manual/sql-reference/Data-Manipulation-Statements/OUTFILE.md
index 6e72184c47..d9dbf95d8b 100644
--- a/docs/zh-CN/docs/sql-manual/sql-reference/Data-Manipulation-Statements/OUTFILE.md
+++ b/docs/zh-CN/docs/sql-manual/sql-reference/Data-Manipulation-Statements/OUTFILE.md
@@ -84,6 +84,7 @@ INTO OUTFILE "file_path"
   line_delimiter: line delimiter, only for the CSV format. <version since="1.2.0">Supports multi-byte delimiters, such as: "\\x01", "abc"</version>
   max_file_size: the size limit of a single file; if the result exceeds this value, it is cut into multiple files. The value range of max_file_size is [5MB, 2GB], and the default is 1GB. (When exporting as ORC, the actual split size is a multiple of 64MB, e.g.: with max_file_size = 5MB the file is actually split at 64MB, and with max_file_size = 65MB at 128MB.)
   delete_existing_files: default false. If set to true, all files under the directory specified by file_path are deleted first, and the data is then exported to that directory. For example: "file_path" = "/user/tmp" deletes all files and directories under "/user/"; "file_path" = "/user/tmp/" deletes all files and directories under "/user/tmp/"
+  file_suffix: specify the suffix of the exported file
 
 Broker related properties need to be prefixed with `broker.`:
   broker.name: broker name
diff --git a/fe/fe-core/src/main/java/org/apache/doris/analysis/OutFileClause.java b/fe/fe-core/src/main/java/org/apache/doris/analysis/OutFileClause.java
index 24ed977a7d..742e1636fd 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/analysis/OutFileClause.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/analysis/OutFileClause.java
@@ -139,6 +139,8 @@ public class OutFileClause {
     public static final String PROP_MAX_FILE_SIZE = "max_file_size";
     private static final String PROP_SUCCESS_FILE_NAME = "success_file_name";
     public static final String PROP_DELETE_EXISTING_FILES = "delete_existing_files";
+    public static final String PROP_FILE_SUFFIX = "file_suffix";
+
     private static final String PARQUET_PROP_PREFIX = "parquet.";
     private static final String SCHEMA = "schema";
@@ -156,6 +158,7 @@ public class OutFileClause {
     private TFileFormatType fileFormatType;
     private long maxFileSizeBytes = DEFAULT_MAX_FILE_SIZE_BYTES;
     private boolean deleteExistingFiles = false;
+    private String fileSuffix = "";
     private BrokerDesc brokerDesc = null;
     // True if result is written to local disk.
     // If set to true, the brokerDesc must be null.
@@ -643,6 +646,11 @@ public class OutFileClause {
             processedPropKeys.add(PROP_DELETE_EXISTING_FILES);
         }
 
+        if (properties.containsKey(PROP_FILE_SUFFIX)) {
+            fileSuffix = properties.get(PROP_FILE_SUFFIX);
+            processedPropKeys.add(PROP_FILE_SUFFIX);
+        }
+
         if (properties.containsKey(PROP_SUCCESS_FILE_NAME)) {
             successFileName = properties.get(PROP_SUCCESS_FILE_NAME);
             FeNameFormat.checkCommonName("file name", successFileName);
@@ -880,6 +888,8 @@ public class OutFileClause {
         }
         sinkOptions.setMaxFileSizeBytes(maxFileSizeBytes);
         sinkOptions.setDeleteExistingFiles(deleteExistingFiles);
+        sinkOptions.setFileSuffix(fileSuffix);
+
         if (brokerDesc != null) {
             sinkOptions.setBrokerProperties(brokerDesc.getProperties());
             // broker_addresses of sinkOptions will be set in Coordinator.
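Taken together, the changes above thread the new property from the SQL layer (OutFileClause) through the sink options into the BE writer. For reference, a minimal export using the property might look like the following sketch; the table name, bucket, endpoint, and credentials are placeholders, and the statement shape follows the regression test added in this commit:

    SELECT * FROM my_table
    INTO OUTFILE "s3://my-bucket/outfile_"
    FORMAT AS CSV
    PROPERTIES(
        "s3.endpoint" = "<endpoint>",
        "s3.region" = "<region>",
        "s3.access_key" = "<ak>",
        "s3.secret_key" = "<sk>",
        "file_suffix" = "txt"
    );

With "file_suffix" = "txt" the exported files end in .txt even though the payload is CSV; omitting the property keeps the old behavior, where the suffix falls back to the format name.
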
diff --git a/gensrc/thrift/DataSinks.thrift b/gensrc/thrift/DataSinks.thrift
index c78a7900a9..b8826eaa1d 100644
--- a/gensrc/thrift/DataSinks.thrift
+++ b/gensrc/thrift/DataSinks.thrift
@@ -124,6 +124,7 @@ struct TResultFileSinkOptions {
     15: optional string orc_schema
     16: optional bool delete_existing_files;
+    17: optional string file_suffix;
 }
 
 struct TMemoryScratchSink {
diff --git a/regression-test/data/export_p0/test_outfile_file_suffix.out b/regression-test/data/export_p0/test_outfile_file_suffix.out
new file mode 100644
index 0000000000..c67bc6748f
--- /dev/null
+++ b/regression-test/data/export_p0/test_outfile_file_suffix.out
@@ -0,0 +1,4 @@
+-- This file is automatically generated. You should know what you did if you want to edit this
+-- !select --
+zhangsan
+
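Per _get_next_file_name in the BE change above, output files are named {file_path}{fragment_instance_id}_{file_idx}.{suffix}, where suffix falls back to the format name when file_suffix is unset. That is why the test below rebuilds the object path by appending "0.${file_suffix}" to the URL returned by the outfile statement. The instance id in this illustration is a hypothetical placeholder:

    s3://my-bucket/outfile_a1b2c3d4e5f60718-9a0b1c2d3e4f5061_0.txt   -- "file_suffix" = "txt"
    s3://my-bucket/outfile_a1b2c3d4e5f60718-9a0b1c2d3e4f5061_0.csv   -- property unset, FORMAT AS CSV
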
+ +suite("test_outfile_file_suffix", "p0") { + // open nereids + sql """ set enable_nereids_planner=true """ + sql """ set enable_fallback_to_original_planner=false """ + + String ak = getS3AK() + String sk = getS3SK() + String s3_endpoint = getS3Endpoint() + String region = getS3Region() + String bucket = context.config.otherConfigs.get("s3BucketName"); + + def create_table = {table_name -> + sql """ DROP TABLE IF EXISTS ${table_name} """ + sql """ + CREATE TABLE IF NOT EXISTS ${table_name} ( + `name` varchar(128) NOT NULL COMMENT "" + ) + DISTRIBUTED BY HASH(name) PROPERTIES("replication_num" = "1"); + """ + sql """ INSERT INTO ${table_name} values('zhangsan');""" + } + + def table_name = "test_outfile_file_suffix" + create_table(table_name) + + def outFilePath = """s3://${bucket}/outfile_""" + def csv_suffix_result = { file_suffix, file_format -> + result = sql """ + select * from ${table_name} + into outfile "${outFilePath}" + FORMAT AS ${file_format} + PROPERTIES( + "s3.endpoint" = "${s3_endpoint}", + "s3.region" = "${region}", + "s3.secret_key"="${sk}", + "s3.access_key" = "${ak}", + "file_suffix" = "${file_suffix}" + ); + """ + return result[0][3] + } + + def file_suffix = "txt"; + def file_format = "csv"; + def outfile_url = csv_suffix_result(file_suffix, file_format); + print("http://${s3_endpoint}${outfile_url.substring(4)}0.${file_suffix}") + qt_select """ select * from s3( + "uri" = "http://${s3_endpoint}${outfile_url.substring(4)}0.${file_suffix}", + "ACCESS_KEY"= "${ak}", + "SECRET_KEY" = "${sk}", + "format" = "${file_format}", + "region" = "${region}" + ); + """ +} --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org For additional commands, e-mail: commits-h...@doris.apache.org