This is an automated email from the ASF dual-hosted git repository.

morningman pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/master by this push: new 210b0ecc225 [Enhencement](Outfile/Export) Export data to csv file format with BOM (#30533) 210b0ecc225 is described below commit 210b0ecc225302d2fb299c701f8575deb9c0877f Author: Tiewei Fang <43782773+bepppo...@users.noreply.github.com> AuthorDate: Fri Feb 9 17:15:55 2024 +0800 [Enhencement](Outfile/Export) Export data to csv file format with BOM (#30533) The UTF8 format of the Windows system has BOM. We add a new user property to `Outfile/Export`。Therefore, when exporting Doris data, users can choose whether to bring BOM on the beginning of the CSV file. **Usage:** ```sql -- outfile: select * from demo.student into outfile "file:///xxx/export/exp_" format as csv properties( "column_separator" = ",", "with_bom" = "true" ); -- Export: EXPORT TABLE student TO "file:///xx/tmpdata/export/exp_" PROPERTIES( "format" = "csv", "with_bom" = "true" ); ``` --- be/src/vec/exec/format/csv/csv_reader.cpp | 35 ++- be/src/vec/exec/format/csv/csv_reader.h | 5 + be/src/vec/runtime/vcsv_transformer.cpp | 11 +- be/src/vec/runtime/vcsv_transformer.h | 5 +- be/src/vec/sink/vresult_sink.h | 3 + be/src/vec/sink/writer/vfile_result_writer.cpp | 7 +- .../Manipulation/EXPORT.md | 11 + .../Manipulation/EXPORT.md | 2 + .../java/org/apache/doris/analysis/ExportStmt.java | 5 + .../org/apache/doris/analysis/OutFileClause.java | 8 + .../main/java/org/apache/doris/load/ExportJob.java | 4 + .../main/java/org/apache/doris/load/ExportMgr.java | 1 + .../org/apache/doris/load/ExportTaskExecutor.java | 6 +- .../trees/plans/commands/ExportCommand.java | 4 + gensrc/thrift/DataSinks.thrift | 1 + regression-test/data/export_p0/test_with_bom.out | 66 ++++++ .../suites/export_p0/test_with_bom.groovy | 241 +++++++++++++++++++++ 17 files changed, 406 insertions(+), 9 deletions(-) diff --git a/be/src/vec/exec/format/csv/csv_reader.cpp b/be/src/vec/exec/format/csv/csv_reader.cpp index 84c8d911f94..86986f8eea5 100644 --- a/be/src/vec/exec/format/csv/csv_reader.cpp +++ b/be/src/vec/exec/format/csv/csv_reader.cpp @@ -462,13 +462,24 @@ Status CsvReader::get_next_block(Block* block, size_t* read_rows, bool* eof) { size_t rows = 0; bool success = false; + bool is_remove_bom = false; if (_push_down_agg_type == TPushAggOp::type::COUNT) { while (rows < batch_size && !_line_reader_eof) { const uint8_t* ptr = nullptr; size_t size = 0; RETURN_IF_ERROR(_line_reader->read_line(&ptr, &size, &_line_reader_eof, _io_ctx)); + + // _skip_lines == 0 means this line is the actual data beginning line for the entire file + // is_remove_bom means _remove_bom should only execute once + if (_skip_lines == 0 && !is_remove_bom) { + ptr = _remove_bom(ptr, size); + is_remove_bom = true; + } + + // _skip_lines > 0 means we do not need to remove bom if (_skip_lines > 0) { _skip_lines--; + is_remove_bom = true; continue; } if (size == 0) { @@ -490,8 +501,18 @@ Status CsvReader::get_next_block(Block* block, size_t* read_rows, bool* eof) { const uint8_t* ptr = nullptr; size_t size = 0; RETURN_IF_ERROR(_line_reader->read_line(&ptr, &size, &_line_reader_eof, _io_ctx)); + + // _skip_lines == 0 means this line is the actual data beginning line for the entire file + // is_remove_bom means _remove_bom should only execute once + if (!is_remove_bom && _skip_lines == 0) { + ptr = _remove_bom(ptr, size); + is_remove_bom = true; + } + + // _skip_lines > 0 means we do not remove bom if (_skip_lines > 0) { _skip_lines--; + is_remove_bom = true; continue; } if (size == 0) { @@ -538,10 +559,11 @@ Status 
CsvReader::get_parsed_schema(std::vector<std::string>* col_names, } else { // parse csv file with names RETURN_IF_ERROR(_parse_col_names(col_names)); } + for (size_t j = 0; j < col_names->size(); ++j) { col_types->emplace_back(TypeDescriptor::create_string_type()); } - } else { // parse csv file without names and types + } else { // parse csv file with names and types RETURN_IF_ERROR(_parse_col_names(col_names)); RETURN_IF_ERROR(_parse_col_types(col_names->size(), col_types)); } @@ -930,6 +952,7 @@ Status CsvReader::_parse_col_nums(size_t* col_nums) { if (!validate_utf8(const_cast<char*>(reinterpret_cast<const char*>(ptr)), size)) { return Status::InternalError<false>("Only support csv data in utf8 codec"); } + ptr = _remove_bom(ptr, size); _split_line(Slice(ptr, size)); *col_nums = _split_values.size(); return Status::OK(); @@ -946,6 +969,7 @@ Status CsvReader::_parse_col_names(std::vector<std::string>* col_names) { if (!validate_utf8(const_cast<char*>(reinterpret_cast<const char*>(ptr)), size)) { return Status::InternalError<false>("Only support csv data in utf8 codec"); } + ptr = _remove_bom(ptr, size); _split_line(Slice(ptr, size)); for (size_t idx = 0; idx < _split_values.size(); ++idx) { col_names->emplace_back(_split_values[idx].to_string()); @@ -969,6 +993,15 @@ Status CsvReader::_parse_col_types(size_t col_nums, std::vector<TypeDescriptor>* return Status::OK(); } +const uint8_t* CsvReader::_remove_bom(const uint8_t* ptr, size_t& size) { + if (size >= 3 && ptr[0] == 0xEF && ptr[1] == 0xBB && ptr[2] == 0xBF) { + LOG(INFO) << "remove bom"; + size -= 3; + return ptr + 3; + } + return ptr; +} + Status CsvReader::close() { if (_line_reader) { _line_reader->close(); diff --git a/be/src/vec/exec/format/csv/csv_reader.h b/be/src/vec/exec/format/csv/csv_reader.h index 19561b39eeb..d9c8633f427 100644 --- a/be/src/vec/exec/format/csv/csv_reader.h +++ b/be/src/vec/exec/format/csv/csv_reader.h @@ -230,6 +230,11 @@ private: // and the line is skipped as unqualified row, and the process should continue. Status _validate_line(const Slice& line, bool* success); + // If the CSV file is an UTF8 encoding with BOM, + // then remove the first 3 bytes at the beginning of this file + // and set size = size - 3. 
+ const uint8_t* _remove_bom(const uint8_t* ptr, size_t& size); + RuntimeState* _state = nullptr; RuntimeProfile* _profile = nullptr; ScannerCounter* _counter = nullptr; diff --git a/be/src/vec/runtime/vcsv_transformer.cpp b/be/src/vec/runtime/vcsv_transformer.cpp index 6628f7743cc..0daba686d62 100644 --- a/be/src/vec/runtime/vcsv_transformer.cpp +++ b/be/src/vec/runtime/vcsv_transformer.cpp @@ -54,15 +54,18 @@ namespace doris::vectorized { +static const unsigned char bom[] = {0xEF, 0xBB, 0xBF}; + VCSVTransformer::VCSVTransformer(RuntimeState* state, doris::io::FileWriter* file_writer, const VExprContextSPtrs& output_vexpr_ctxs, bool output_object_data, std::string_view header_type, std::string_view header, std::string_view column_separator, - std::string_view line_delimiter) + std::string_view line_delimiter, bool with_bom) : VFileFormatTransformer(state, output_vexpr_ctxs, output_object_data), _column_separator(column_separator), _line_delimiter(line_delimiter), - _file_writer(file_writer) { + _file_writer(file_writer), + _with_bom(with_bom) { if (header.size() > 0) { _csv_header = header; if (header_type == BeConsts::CSV_WITH_NAMES_AND_TYPES) { @@ -74,6 +77,10 @@ VCSVTransformer::VCSVTransformer(RuntimeState* state, doris::io::FileWriter* fil } Status VCSVTransformer::open() { + if (_with_bom) { + RETURN_IF_ERROR( + _file_writer->append(Slice(reinterpret_cast<const char*>(bom), sizeof(bom)))); + } if (!_csv_header.empty()) { return _file_writer->append(Slice(_csv_header.data(), _csv_header.size())); } diff --git a/be/src/vec/runtime/vcsv_transformer.h b/be/src/vec/runtime/vcsv_transformer.h index 8821b857184..779b8e41377 100644 --- a/be/src/vec/runtime/vcsv_transformer.h +++ b/be/src/vec/runtime/vcsv_transformer.h @@ -43,7 +43,8 @@ public: VCSVTransformer(RuntimeState* state, doris::io::FileWriter* file_writer, const VExprContextSPtrs& output_vexpr_ctxs, bool output_object_data, std::string_view header_type, std::string_view header, - std::string_view column_separator, std::string_view line_delimiter); + std::string_view column_separator, std::string_view line_delimiter, + bool with_bom); ~VCSVTransformer() = default; @@ -71,6 +72,8 @@ private: // For example: bitmap_to_string() may return large volume of data. // And the speed is relative low, in my test, is about 6.5MB/s. fmt::memory_buffer _outstream_buffer; + + bool _with_bom = false; }; } // namespace doris::vectorized diff --git a/be/src/vec/sink/vresult_sink.h b/be/src/vec/sink/vresult_sink.h index 0dd69ee84cd..c5b092bcd41 100644 --- a/be/src/vec/sink/vresult_sink.h +++ b/be/src/vec/sink/vresult_sink.h @@ -70,6 +70,8 @@ struct ResultFileOptions { bool delete_existing_files = false; std::string file_suffix; + //Bring BOM when exporting to CSV format + bool with_bom = false; ResultFileOptions(const TResultFileSinkOptions& t_opt) { file_path = t_opt.file_path; @@ -81,6 +83,7 @@ struct ResultFileOptions { delete_existing_files = t_opt.__isset.delete_existing_files ? 
t_opt.delete_existing_files : false; file_suffix = t_opt.file_suffix; + with_bom = t_opt.with_bom; is_local_file = true; if (t_opt.__isset.broker_addresses) { diff --git a/be/src/vec/sink/writer/vfile_result_writer.cpp b/be/src/vec/sink/writer/vfile_result_writer.cpp index f7015e17c96..bf3d09cdda6 100644 --- a/be/src/vec/sink/writer/vfile_result_writer.cpp +++ b/be/src/vec/sink/writer/vfile_result_writer.cpp @@ -143,9 +143,10 @@ Status VFileResultWriter::_create_file_writer(const std::string& file_name) { _file_writer_impl)); switch (_file_opts->file_format) { case TFileFormatType::FORMAT_CSV_PLAIN: - _vfile_writer.reset(new VCSVTransformer( - _state, _file_writer_impl.get(), _vec_output_expr_ctxs, _output_object_data, - _header_type, _header, _file_opts->column_separator, _file_opts->line_delimiter)); + _vfile_writer.reset(new VCSVTransformer(_state, _file_writer_impl.get(), + _vec_output_expr_ctxs, _output_object_data, + _header_type, _header, _file_opts->column_separator, + _file_opts->line_delimiter, _file_opts->with_bom)); break; case TFileFormatType::FORMAT_PARQUET: _vfile_writer.reset(new VParquetTransformer( diff --git a/docs/en/docs/sql-manual/sql-reference/Data-Manipulation-Statements/Manipulation/EXPORT.md b/docs/en/docs/sql-manual/sql-reference/Data-Manipulation-Statements/Manipulation/EXPORT.md index 8eab419de6d..c6479d0e824 100644 --- a/docs/en/docs/sql-manual/sql-reference/Data-Manipulation-Statements/Manipulation/EXPORT.md +++ b/docs/en/docs/sql-manual/sql-reference/Data-Manipulation-Statements/Manipulation/EXPORT.md @@ -77,14 +77,25 @@ The bottom layer of the `Export` statement actually executes the `select...outfi The following parameters can be specified: - `label`: This parameter is optional, specifies the label of the export task. If this parameter is not specified, the system randomly assigns a label to the export task. + - `column_separator`: Specifies the exported column separator, default is `\t`, mulit-bytes is supported. This parameter is only used for `CSV` file format. + - `line_delimiter`: Specifies the line delimiter for export, the default is `\n`, mulit-bytes is supported. This parameter is only used for `CSV` file format. + - `timeout`: The timeout period of the export job, the default is 2 hours, the unit is seconds. + - `columns`: Specifies certain columns of the export job table + - `format`: Specifies the file format, support: parquet, orc, csv, csv_with_names, csv_with_names_and_types.The default is csv format. + - `parallelism`: The concurrency degree of the `export` job, the default is `1`. The export job will be divided into `select..outfile..` statements of the number of `parallelism` to execute concurrently. (If the value of `parallelism` is greater than the number of tablets in the table, the system will automatically set `parallelism` to the number of tablets, that is, each `select..outfile..` statement is responsible for one tablet) + - `delete_existing_files`: default `false`. If it is specified as true, you will first delete all files specified in the directory specified by the file_path, and then export the data to the directory.For example: "file_path" = "/user/tmp", then delete all files and directory under "/user/"; "file_path" = "/user/tmp/", then delete all files and directory under "/user/tmp/" + - `max_file_size`: it is the limit for the size of a single file in the export job. If the result file exceeds this value, it will be split into multiple files. The valid range for `max_file_size` is [5MB, 2GB], with a default value of 1GB. 
(When exporting to the ORC file format, the actual size of the split files will be multiples of 64MB, for example, if max_file_size is specified as 5MB, the actual split size will be 64MB; if max_file_size is specified as 65MB, the actual split size [...] + + - `with_bom`: The default is false. If it is set to true, the exported file is encoded in UTF8 with BOM (valid only for CSV-related file format). + - `timeout`: This is the timeout parameter of the export job, the default timeout is 2 hours, and the unit is seconds. > Note that to use the `delete_existing_files` parameter, you also need to add the configuration `enable_delete_existing_files = true` to the fe.conf file and restart the FE. Only then will the `delete_existing_files` parameter take effect. Setting `delete_existing_files = true` is a dangerous operation and it is recommended to only use it in a testing environment. diff --git a/docs/zh-CN/docs/sql-manual/sql-reference/Data-Manipulation-Statements/Manipulation/EXPORT.md b/docs/zh-CN/docs/sql-manual/sql-reference/Data-Manipulation-Statements/Manipulation/EXPORT.md index ca04b4c8a42..28021c8d7f0 100644 --- a/docs/zh-CN/docs/sql-manual/sql-reference/Data-Manipulation-Statements/Manipulation/EXPORT.md +++ b/docs/zh-CN/docs/sql-manual/sql-reference/Data-Manipulation-Statements/Manipulation/EXPORT.md @@ -90,6 +90,8 @@ EXPORT - `delete_existing_files`: 默认为false,若指定为true,则会先删除`export_path`所指定目录下的所有文件,然后导出数据到该目录下。例如:"export_path" = "/user/tmp", 则会删除"/user/"下所有文件及目录;"file_path" = "/user/tmp/", 则会删除"/user/tmp/"下所有文件及目录。 + - `with_bom`: 默认为false,若指定为true,则导出的文件编码为带有BOM的UTF8编码(只对csv相关的文件格式生效)。 + - `timeout`:导出作业的超时时间,默认为2小时,单位是秒。 > 注意:要使用delete_existing_files参数,还需要在fe.conf中添加配置`enable_delete_existing_files = true`并重启fe,此时delete_existing_files才会生效。delete_existing_files = true 是一个危险的操作,建议只在测试环境中使用。 diff --git a/fe/fe-core/src/main/java/org/apache/doris/analysis/ExportStmt.java b/fe/fe-core/src/main/java/org/apache/doris/analysis/ExportStmt.java index 1b33630e802..123bfa727fb 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/analysis/ExportStmt.java +++ b/fe/fe-core/src/main/java/org/apache/doris/analysis/ExportStmt.java @@ -103,6 +103,7 @@ public class ExportStmt extends StatementBase { private String maxFileSize; private String deleteExistingFiles; + private String withBom; private SessionVariable sessionVariables; private String qualifiedUser; @@ -228,6 +229,7 @@ public class ExportStmt extends StatementBase { exportJob.setParallelism(this.parallelism); exportJob.setMaxFileSize(this.maxFileSize); exportJob.setDeleteExistingFiles(this.deleteExistingFiles); + exportJob.setWithBom(this.withBom); if (columns != null) { Splitter split = Splitter.on(',').trimResults().omitEmptyStrings(); @@ -354,6 +356,9 @@ public class ExportStmt extends StatementBase { // generate a random label this.label = "export_" + UUID.randomUUID(); } + + // with bom + this.withBom = properties.getOrDefault(OutFileClause.PROP_WITH_BOM, "false"); } private void checkColumns() throws DdlException { diff --git a/fe/fe-core/src/main/java/org/apache/doris/analysis/OutFileClause.java b/fe/fe-core/src/main/java/org/apache/doris/analysis/OutFileClause.java index 236788ce328..945a8b07452 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/analysis/OutFileClause.java +++ b/fe/fe-core/src/main/java/org/apache/doris/analysis/OutFileClause.java @@ -136,6 +136,7 @@ public class OutFileClause { private static final String PROP_SUCCESS_FILE_NAME = "success_file_name"; public static final String PROP_DELETE_EXISTING_FILES = 
"delete_existing_files"; public static final String PROP_FILE_SUFFIX = "file_suffix"; + public static final String PROP_WITH_BOM = "with_bom"; private static final String PARQUET_PROP_PREFIX = "parquet."; private static final String SCHEMA = "schema"; @@ -155,6 +156,7 @@ public class OutFileClause { private long maxFileSizeBytes = DEFAULT_MAX_FILE_SIZE_BYTES; private boolean deleteExistingFiles = false; private String fileSuffix = ""; + private boolean withBom = false; private BrokerDesc brokerDesc = null; // True if result is written to local disk. // If set to true, the brokerDesc must be null. @@ -566,6 +568,11 @@ public class OutFileClause { processedPropKeys.add(PROP_FILE_SUFFIX); } + if (properties.containsKey(PROP_WITH_BOM)) { + withBom = Boolean.valueOf(properties.get(PROP_WITH_BOM)).booleanValue(); + processedPropKeys.add(PROP_WITH_BOM); + } + if (properties.containsKey(PROP_SUCCESS_FILE_NAME)) { successFileName = properties.get(PROP_SUCCESS_FILE_NAME); FeNameFormat.checkOutfileSuccessFileName("file name", successFileName); @@ -805,6 +812,7 @@ public class OutFileClause { sinkOptions.setMaxFileSizeBytes(maxFileSizeBytes); sinkOptions.setDeleteExistingFiles(deleteExistingFiles); sinkOptions.setFileSuffix(fileSuffix); + sinkOptions.setWithBom(withBom); if (brokerDesc != null) { sinkOptions.setBrokerProperties(brokerDesc.getProperties()); diff --git a/fe/fe-core/src/main/java/org/apache/doris/load/ExportJob.java b/fe/fe-core/src/main/java/org/apache/doris/load/ExportJob.java index 0ae513d1db6..f6da9a6a97a 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/load/ExportJob.java +++ b/fe/fe-core/src/main/java/org/apache/doris/load/ExportJob.java @@ -166,6 +166,8 @@ public class ExportJob implements Writable { @SerializedName("tabletsNum") private Integer tabletsNum; + @SerializedName("withBom") + private String withBom; private TableRef tableRef; @@ -219,6 +221,7 @@ public class ExportJob implements Writable { this.columnSeparator = "\t"; this.lineDelimiter = "\n"; this.columns = ""; + this.withBom = "false"; } public ExportJob(long jobId) { @@ -554,6 +557,7 @@ public class ExportJob implements Writable { if (!deleteExistingFiles.isEmpty()) { outfileProperties.put(OutFileClause.PROP_DELETE_EXISTING_FILES, deleteExistingFiles); } + outfileProperties.put(OutFileClause.PROP_WITH_BOM, withBom); // broker properties // outfile clause's broker properties need 'broker.' 
prefix diff --git a/fe/fe-core/src/main/java/org/apache/doris/load/ExportMgr.java b/fe/fe-core/src/main/java/org/apache/doris/load/ExportMgr.java index 90be7f9f10e..efbfd33966f 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/load/ExportMgr.java +++ b/fe/fe-core/src/main/java/org/apache/doris/load/ExportMgr.java @@ -349,6 +349,7 @@ public class ExportMgr { infoMap.put("broker", job.getBrokerDesc().getName()); infoMap.put("column_separator", job.getColumnSeparator()); infoMap.put("format", job.getFormat()); + infoMap.put("with_bom", job.getWithBom()); infoMap.put("line_delimiter", job.getLineDelimiter()); infoMap.put("columns", job.getColumns()); infoMap.put("tablet_num", job.getTabletsNum()); diff --git a/fe/fe-core/src/main/java/org/apache/doris/load/ExportTaskExecutor.java b/fe/fe-core/src/main/java/org/apache/doris/load/ExportTaskExecutor.java index 26e873afd80..3f7c17fef68 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/load/ExportTaskExecutor.java +++ b/fe/fe-core/src/main/java/org/apache/doris/load/ExportTaskExecutor.java @@ -40,7 +40,6 @@ import org.apache.doris.scheduler.executor.TransientTaskExecutor; import org.apache.doris.thrift.TUniqueId; import com.google.common.collect.Lists; -import lombok.Setter; import lombok.extern.slf4j.Slf4j; import java.util.List; @@ -56,7 +55,6 @@ public class ExportTaskExecutor implements TransientTaskExecutor { ExportJob exportJob; - @Setter Long taskId; private StmtExecutor stmtExecutor; @@ -205,4 +203,8 @@ public class ExportTaskExecutor implements TransientTaskExecutor { } return Optional.empty(); } + + public void setTaskId(Long taskId) { + this.taskId = taskId; + } } diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/plans/commands/ExportCommand.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/plans/commands/ExportCommand.java index 6d4fb22282c..b43fc9348ce 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/plans/commands/ExportCommand.java +++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/plans/commands/ExportCommand.java @@ -88,6 +88,7 @@ public class ExportCommand extends Command implements ForwardWithSync { .add(PropertyAnalyzer.PROPERTIES_LINE_DELIMITER) .add(PropertyAnalyzer.PROPERTIES_TIMEOUT) .add("format") + .add(OutFileClause.PROP_WITH_BOM) .build(); private final List<String> nameParts; @@ -267,6 +268,9 @@ public class ExportCommand extends Command implements ForwardWithSync { exportJob.setFormat(fileProperties.getOrDefault(LoadStmt.KEY_IN_PARAM_FORMAT_TYPE, "csv") .toLowerCase()); + // set withBom + exportJob.setWithBom(fileProperties.getOrDefault(OutFileClause.PROP_WITH_BOM, "false")); + // set parallelism int parallelism; try { diff --git a/gensrc/thrift/DataSinks.thrift b/gensrc/thrift/DataSinks.thrift index f9e26e7a47c..068b5927004 100644 --- a/gensrc/thrift/DataSinks.thrift +++ b/gensrc/thrift/DataSinks.thrift @@ -128,6 +128,7 @@ struct TResultFileSinkOptions { 16: optional bool delete_existing_files; 17: optional string file_suffix; + 18: optional bool with_bom; } struct TMemoryScratchSink { diff --git a/regression-test/data/export_p0/test_with_bom.out b/regression-test/data/export_p0/test_with_bom.out new file mode 100644 index 00000000000..de995bdce0a --- /dev/null +++ b/regression-test/data/export_p0/test_with_bom.out @@ -0,0 +1,66 @@ +-- This file is automatically generated. 
You should know what you did if you want to edit this +-- !select_export1 -- +1 2017-10-01 +10 2017-10-01 +11 2017-10-01 +2 2017-10-01 +3 2017-10-01 +4 2017-10-01 +5 2017-10-01 +6 2017-10-01 +7 2017-10-01 +8 2017-10-01 +9 2017-10-01 + +-- !select_load1 -- +1 2017-10-01 +10 2017-10-01 +11 2017-10-01 +2 2017-10-01 +3 2017-10-01 +4 2017-10-01 +5 2017-10-01 +6 2017-10-01 +7 2017-10-01 +8 2017-10-01 +9 2017-10-01 + +-- !select_load1 -- +1 2017-10-01 +10 2017-10-01 +11 2017-10-01 +2 2017-10-01 +3 2017-10-01 +4 2017-10-01 +5 2017-10-01 +6 2017-10-01 +7 2017-10-01 +8 2017-10-01 +9 2017-10-01 + +-- !select_load1 -- +1 2017-10-01 +10 2017-10-01 +11 2017-10-01 +2 2017-10-01 +3 2017-10-01 +4 2017-10-01 +5 2017-10-01 +6 2017-10-01 +7 2017-10-01 +8 2017-10-01 +9 2017-10-01 + +-- !select_load1 -- +1 2017-10-01 +10 2017-10-01 +11 2017-10-01 +2 2017-10-01 +3 2017-10-01 +4 2017-10-01 +5 2017-10-01 +6 2017-10-01 +7 2017-10-01 +8 2017-10-01 +9 2017-10-01 + diff --git a/regression-test/suites/export_p0/test_with_bom.groovy b/regression-test/suites/export_p0/test_with_bom.groovy new file mode 100644 index 00000000000..ede264f189e --- /dev/null +++ b/regression-test/suites/export_p0/test_with_bom.groovy @@ -0,0 +1,241 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +import org.codehaus.groovy.runtime.IOGroovyMethods + +import java.nio.charset.StandardCharsets +import java.nio.file.Files +import java.nio.file.Paths + +suite("test_with_bom", "p0") { + // open nereids + sql """ set enable_nereids_planner=true """ + sql """ set enable_fallback_to_original_planner=false """ + + + String ak = getS3AK() + String sk = getS3SK() + String s3_endpoint = getS3Endpoint() + String region = getS3Region() + String bucket = context.config.otherConfigs.get("s3BucketName"); + + + def this_db = "regression_test_export_p0" + def table_export_name = "test_with_bom" + def table_load_name = "test_with_bom_load_back" + def outfile_path_prefix = """${bucket}/export/test_with_bom/exp_""" + def file_format = "csv" + + // test csv with bom + sql """ DROP TABLE IF EXISTS ${table_export_name} """ + sql """ + CREATE TABLE IF NOT EXISTS ${table_export_name} ( + `user_id` INT NOT NULL COMMENT "用户id", + `date` DATE NOT NULL COMMENT "数据灌入日期时间" + ) + DISTRIBUTED BY HASH(user_id) + PROPERTIES("replication_num" = "1"); + """ + StringBuilder sb = new StringBuilder() + int i = 1 + for (; i < 11; i ++) { + sb.append(""" + (${i}, '2017-10-01'), + """) + } + sb.append(""" + (${i}, '2017-10-01') + """) + sql """ INSERT INTO ${table_export_name} VALUES + ${sb.toString()} + """ + def insert_res = sql "show last insert;" + logger.info("insert result: " + insert_res.toString()) + order_qt_select_export1 """ SELECT * FROM ${table_export_name} t ORDER BY user_id; """ + + + def waiting_export = { export_label -> + while (true) { + def res = sql """ show export where label = "${export_label}";""" + logger.info("export state: " + res[0][2]) + if (res[0][2] == "FINISHED") { + def json = parseJson(res[0][11]) + assert json instanceof List + assertEquals("1", json.fileNumber[0][0]) + log.info("outfile_path: ${json.url[0][0]}") + return json.url[0][0]; + } else if (res[0][2] == "CANCELLED") { + throw new IllegalStateException("""export failed: ${res[0][10]}""") + } else { + sleep(5000) + } + } + } + + def check_bytes = { except_bytes, export_label -> + def show_export_res = sql_return_maparray """ show export where label = "${export_label}"; """ + def export_job = show_export_res[0] + def outfile_json = parseJson(export_job.OutfileInfo) + assertEquals(except_bytes, outfile_json[0][0].fileSize) + } + + // 1. exec export without bom + def uuid = UUID.randomUUID().toString() + def outFilePath = """${outfile_path_prefix}_${uuid}""" + def label = "label_${uuid}" + try { + // exec export + sql """ + EXPORT TABLE ${table_export_name} TO "s3://${outFilePath}/" + PROPERTIES( + "label" = "${label}", + "format" = "${file_format}" + ) + WITH S3( + "s3.endpoint" = "${s3_endpoint}", + "s3.region" = "${region}", + "s3.secret_key"="${sk}", + "s3.access_key" = "${ak}" + ); + """ + def outfile_url = waiting_export.call(label) + + order_qt_select_load1 """ select * from s3( + "uri" = "http://${s3_endpoint}${outfile_url.substring(4, outfile_url.length() - 1)}0.${file_format}", + "s3.access_key"= "${ak}", + "s3.secret_key" = "${sk}", + "format" = "csv", + "region" = "${region}" + ) ORDER BY c1; + """ + + // check outfile bytes + check_bytes("145bytes", label) + } finally { + } + + + // 2. 
exec export with bom + uuid = UUID.randomUUID().toString() + outFilePath = """${outfile_path_prefix}_${uuid}""" + label = "label_${uuid}" + try { + // exec export + sql """ + EXPORT TABLE ${table_export_name} TO "s3://${outFilePath}/" + PROPERTIES( + "label" = "${label}", + "format" = "${file_format}", + "with_bom" = "true" + ) + WITH S3( + "s3.endpoint" = "${s3_endpoint}", + "s3.region" = "${region}", + "s3.secret_key"="${sk}", + "s3.access_key" = "${ak}" + ); + """ + def outfile_url = waiting_export.call(label) + + order_qt_select_load1 """ select * from s3( + "uri" = "http://${s3_endpoint}${outfile_url.substring(4, outfile_url.length() - 1)}0.${file_format}", + "s3.access_key"= "${ak}", + "s3.secret_key" = "${sk}", + "format" = "csv", + "region" = "${region}" + ) ORDER BY c1; + """ + + // check outfile bytes + check_bytes("148bytes", label) + } finally { + } + + + // 3. test csv_with_names with bom + uuid = UUID.randomUUID().toString() + outFilePath = """${outfile_path_prefix}_${uuid}""" + label = "label_${uuid}" + try { + // exec export + sql """ + EXPORT TABLE ${table_export_name} TO "s3://${outFilePath}/" + PROPERTIES( + "label" = "${label}", + "format" = "csv_with_names", + "with_bom" = "true" + ) + WITH S3( + "s3.endpoint" = "${s3_endpoint}", + "s3.region" = "${region}", + "s3.secret_key"="${sk}", + "s3.access_key" = "${ak}" + ); + """ + def outfile_url = waiting_export.call(label) + + order_qt_select_load1 """ select * from s3( + "uri" = "http://${s3_endpoint}${outfile_url.substring(4, outfile_url.length() - 1)}0.${file_format}", + "s3.access_key"= "${ak}", + "s3.secret_key" = "${sk}", + "format" = "csv_with_names", + "region" = "${region}" + ) ORDER BY user_id; + """ + + // check outfile bytes + check_bytes("161bytes", label) + } finally { + } + + + // 4. test csv_with_names_and_types with bom + uuid = UUID.randomUUID().toString() + outFilePath = """${outfile_path_prefix}_${uuid}""" + label = "label_${uuid}" + try { + // exec export + sql """ + EXPORT TABLE ${table_export_name} TO "s3://${outFilePath}/" + PROPERTIES( + "label" = "${label}", + "format" = "csv_with_names_and_types", + "with_bom" = "true" + ) + WITH S3( + "s3.endpoint" = "${s3_endpoint}", + "s3.region" = "${region}", + "s3.secret_key"="${sk}", + "s3.access_key" = "${ak}" + ); + """ + def outfile_url = waiting_export.call(label) + + order_qt_select_load1 """ select * from s3( + "uri" = "http://${s3_endpoint}${outfile_url.substring(4, outfile_url.length() - 1)}0.${file_format}", + "s3.access_key"= "${ak}", + "s3.secret_key" = "${sk}", + "format" = "csv_with_names_and_types", + "region" = "${region}" + ) ORDER BY user_id; + """ + + // check outfile bytes + check_bytes("172bytes", label) + } finally { + } +} --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org For additional commands, e-mail: commits-h...@doris.apache.org
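[Editor's note] For readers skimming the BE change above, the reader-side handling boils down to the 3-byte check added in csv_reader.cpp: if the first data line starts with the UTF-8 BOM (0xEF 0xBB 0xBF), advance the pointer by 3 and shrink the reported size. A minimal standalone sketch of that rule (same byte comparison as the committed `_remove_bom`, but extracted out of the Doris `CsvReader` class so it compiles on its own) could look like this:

```cpp
#include <cstddef>
#include <cstdint>
#include <iostream>
#include <string>

// Sketch of the BOM-stripping rule from CsvReader::_remove_bom in this commit:
// if the buffer starts with the UTF-8 BOM, skip those 3 bytes and report a
// correspondingly smaller size; otherwise return the buffer unchanged.
const uint8_t* remove_bom(const uint8_t* ptr, size_t& size) {
    if (size >= 3 && ptr[0] == 0xEF && ptr[1] == 0xBB && ptr[2] == 0xBF) {
        size -= 3;
        return ptr + 3;
    }
    return ptr;
}

int main() {
    // A BOM followed by a regular CSV line, as an exported file would start.
    std::string line = "\xEF\xBB\xBF" "1,2017-10-01";
    size_t size = line.size();
    const uint8_t* ptr = reinterpret_cast<const uint8_t*>(line.data());

    ptr = remove_bom(ptr, size);
    std::cout << std::string(reinterpret_cast<const char*>(ptr), size) << "\n";
    // prints: 1,2017-10-01
    return 0;
}
```

As in the committed code, the check runs only once, on the first line that is actually consumed (skipped header lines counted by `_skip_lines` are left alone, since the BOM is consumed together with the first skipped line).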
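[Editor's note] On the write path, `VCSVTransformer::open()` appends the three BOM bytes before the optional CSV header when `with_bom` is set. Below is a hedged sketch of that ordering against a plain `std::ofstream`; the real code writes through `doris::io::FileWriter`, which is not reproduced here, and `write_csv` is a hypothetical helper for illustration only:

```cpp
#include <fstream>
#include <string>
#include <vector>

// Same byte sequence the commit defines in vcsv_transformer.cpp.
static const unsigned char kBom[] = {0xEF, 0xBB, 0xBF};

// Sketch only: write an optional UTF-8 BOM, then an optional header line,
// then the data rows, mimicking the order used by VCSVTransformer::open()
// followed by the normal row writes.
void write_csv(const std::string& path, bool with_bom, const std::string& header,
               const std::vector<std::string>& rows) {
    std::ofstream out(path, std::ios::binary);
    if (with_bom) {
        // BOM goes first, before the csv_with_names / csv_with_names_and_types header.
        out.write(reinterpret_cast<const char*>(kBom), sizeof(kBom));
    }
    if (!header.empty()) {
        out << header;  // e.g. "user_id\tdate\n" for csv_with_names
    }
    for (const auto& row : rows) {
        out << row;
    }
}
```

The regression test's size assertions are consistent with this ordering: assuming the default `\t` separator and `\n` delimiter, the 11 rows of `test_with_bom` take 9 × 13 + 2 × 14 = 145 bytes, enabling `with_bom` adds exactly the 3 BOM bytes (148 bytes), and the csv_with_names and csv_with_names_and_types cases grow further only by their header lines.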