This is an automated email from the ASF dual-hosted git repository.

morningman pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/master by this push: new 210b0ecc225 [Enhencement](Outfile/Export) Export data to csv file format with BOM (#30533) 210b0ecc225 is described below commit 210b0ecc225302d2fb299c701f8575deb9c0877f Author: Tiewei Fang <43782773+bepppo...@users.noreply.github.com> AuthorDate: Fri Feb 9 17:15:55 2024 +0800 [Enhencement](Outfile/Export) Export data to csv file format with BOM (#30533) The UTF8 format of the Windows system has BOM. We add a new user property to `Outfile/Export`。Therefore, when exporting Doris data, users can choose whether to bring BOM on the beginning of the CSV file. **Usage:** ```sql -- outfile: select * from demo.student into outfile "file:///xxx/export/exp_" format as csv properties( "column_separator" = ",", "with_bom" = "true" ); -- Export: EXPORT TABLE student TO "file:///xx/tmpdata/export/exp_" PROPERTIES( "format" = "csv", "with_bom" = "true" ); ``` --- be/src/vec/exec/format/csv/csv_reader.cpp | 35 ++- be/src/vec/exec/format/csv/csv_reader.h | 5 + be/src/vec/runtime/vcsv_transformer.cpp | 11 +- be/src/vec/runtime/vcsv_transformer.h | 5 +- be/src/vec/sink/vresult_sink.h | 3 + be/src/vec/sink/writer/vfile_result_writer.cpp | 7 +- .../Manipulation/EXPORT.md | 11 + .../Manipulation/EXPORT.md | 2 + .../java/org/apache/doris/analysis/ExportStmt.java | 5 + .../org/apache/doris/analysis/OutFileClause.java | 8 + .../main/java/org/apache/doris/load/ExportJob.java | 4 + .../main/java/org/apache/doris/load/ExportMgr.java | 1 + .../org/apache/doris/load/ExportTaskExecutor.java | 6 +- .../trees/plans/commands/ExportCommand.java | 4 + gensrc/thrift/DataSinks.thrift | 1 + regression-test/data/export_p0/test_with_bom.out | 66 ++++++ .../suites/export_p0/test_with_bom.groovy | 241 +++++++++++++++++++++ 17 files changed, 406 insertions(+), 9 deletions(-) diff --git a/be/src/vec/exec/format/csv/csv_reader.cpp b/be/src/vec/exec/format/csv/csv_reader.cpp index 84c8d911f94..86986f8eea5 100644 --- a/be/src/vec/exec/format/csv/csv_reader.cpp +++ b/be/src/vec/exec/format/csv/csv_reader.cpp @@ -462,13 +462,24 @@ Status CsvReader::get_next_block(Block* block, size_t* read_rows, bool* eof) { size_t rows = 0; bool success = false; + bool is_remove_bom = false; if (_push_down_agg_type == TPushAggOp::type::COUNT) { while (rows < batch_size && !_line_reader_eof) { const uint8_t* ptr = nullptr; size_t size = 0; RETURN_IF_ERROR(_line_reader->read_line(&ptr, &size, &_line_reader_eof, _io_ctx)); + + // _skip_lines == 0 means this line is the actual data beginning line for the entire file + // is_remove_bom means _remove_bom should only execute once + if (_skip_lines == 0 && !is_remove_bom) { + ptr = _remove_bom(ptr, size); + is_remove_bom = true; + } + + // _skip_lines > 0 means we do not need to remove bom if (_skip_lines > 0) { _skip_lines--; + is_remove_bom = true; continue; } if (size == 0) { @@ -490,8 +501,18 @@ Status CsvReader::get_next_block(Block* block, size_t* read_rows, bool* eof) { const uint8_t* ptr = nullptr; size_t size = 0; RETURN_IF_ERROR(_line_reader->read_line(&ptr, &size, &_line_reader_eof, _io_ctx)); + + // _skip_lines == 0 means this line is the actual data beginning line for the entire file + // is_remove_bom means _remove_bom should only execute once + if (!is_remove_bom && _skip_lines == 0) { + ptr = _remove_bom(ptr, size); + is_remove_bom = true; + } + + // _skip_lines > 0 means we do not remove bom if (_skip_lines > 0) { _skip_lines--; + is_remove_bom = true; continue; } if (size == 0) { @@ -538,10 +559,11 @@ Status 
CsvReader::get_parsed_schema(std::vector<std::string>* col_names, } else { // parse csv file with names RETURN_IF_ERROR(_parse_col_names(col_names)); } + for (size_t j = 0; j < col_names->size(); ++j) { col_types->emplace_back(TypeDescriptor::create_string_type()); } - } else { // parse csv file without names and types + } else { // parse csv file with names and types RETURN_IF_ERROR(_parse_col_names(col_names)); RETURN_IF_ERROR(_parse_col_types(col_names->size(), col_types)); } @@ -930,6 +952,7 @@ Status CsvReader::_parse_col_nums(size_t* col_nums) { if (!validate_utf8(const_cast<char*>(reinterpret_cast<const char*>(ptr)), size)) { return Status::InternalError<false>("Only support csv data in utf8 codec"); } + ptr = _remove_bom(ptr, size); _split_line(Slice(ptr, size)); *col_nums = _split_values.size(); return Status::OK(); @@ -946,6 +969,7 @@ Status CsvReader::_parse_col_names(std::vector<std::string>* col_names) { if (!validate_utf8(const_cast<char*>(reinterpret_cast<const char*>(ptr)), size)) { return Status::InternalError<false>("Only support csv data in utf8 codec"); } + ptr = _remove_bom(ptr, size); _split_line(Slice(ptr, size)); for (size_t idx = 0; idx < _split_values.size(); ++idx) { col_names->emplace_back(_split_values[idx].to_string()); @@ -969,6 +993,15 @@ Status CsvReader::_parse_col_types(size_t col_nums, std::vector<TypeDescriptor>* return Status::OK(); } +const uint8_t* CsvReader::_remove_bom(const uint8_t* ptr, size_t& size) { + if (size >= 3 && ptr[0] == 0xEF && ptr[1] == 0xBB && ptr[2] == 0xBF) { + LOG(INFO) << "remove bom"; + size -= 3; + return ptr + 3; + } + return ptr; +} + Status CsvReader::close() { if (_line_reader) { _line_reader->close(); diff --git a/be/src/vec/exec/format/csv/csv_reader.h b/be/src/vec/exec/format/csv/csv_reader.h index 19561b39eeb..d9c8633f427 100644 --- a/be/src/vec/exec/format/csv/csv_reader.h +++ b/be/src/vec/exec/format/csv/csv_reader.h @@ -230,6 +230,11 @@ private: // and the line is skipped as unqualified row, and the process should continue. Status _validate_line(const Slice& line, bool* success); + // If the CSV file is an UTF8 encoding with BOM, + // then remove the first 3 bytes at the beginning of this file + // and set size = size - 3. 
+ const uint8_t* _remove_bom(const uint8_t* ptr, size_t& size); + RuntimeState* _state = nullptr; RuntimeProfile* _profile = nullptr; ScannerCounter* _counter = nullptr; diff --git a/be/src/vec/runtime/vcsv_transformer.cpp b/be/src/vec/runtime/vcsv_transformer.cpp index 6628f7743cc..0daba686d62 100644 --- a/be/src/vec/runtime/vcsv_transformer.cpp +++ b/be/src/vec/runtime/vcsv_transformer.cpp @@ -54,15 +54,18 @@ namespace doris::vectorized { +static const unsigned char bom[] = {0xEF, 0xBB, 0xBF}; + VCSVTransformer::VCSVTransformer(RuntimeState* state, doris::io::FileWriter* file_writer, const VExprContextSPtrs& output_vexpr_ctxs, bool output_object_data, std::string_view header_type, std::string_view header, std::string_view column_separator, - std::string_view line_delimiter) + std::string_view line_delimiter, bool with_bom) : VFileFormatTransformer(state, output_vexpr_ctxs, output_object_data), _column_separator(column_separator), _line_delimiter(line_delimiter), - _file_writer(file_writer) { + _file_writer(file_writer), + _with_bom(with_bom) { if (header.size() > 0) { _csv_header = header; if (header_type == BeConsts::CSV_WITH_NAMES_AND_TYPES) { @@ -74,6 +77,10 @@ VCSVTransformer::VCSVTransformer(RuntimeState* state, doris::io::FileWriter* fil } Status VCSVTransformer::open() { + if (_with_bom) { + RETURN_IF_ERROR( + _file_writer->append(Slice(reinterpret_cast<const char*>(bom), sizeof(bom)))); + } if (!_csv_header.empty()) { return _file_writer->append(Slice(_csv_header.data(), _csv_header.size())); } diff --git a/be/src/vec/runtime/vcsv_transformer.h b/be/src/vec/runtime/vcsv_transformer.h index 8821b857184..779b8e41377 100644 --- a/be/src/vec/runtime/vcsv_transformer.h +++ b/be/src/vec/runtime/vcsv_transformer.h @@ -43,7 +43,8 @@ public: VCSVTransformer(RuntimeState* state, doris::io::FileWriter* file_writer, const VExprContextSPtrs& output_vexpr_ctxs, bool output_object_data, std::string_view header_type, std::string_view header, - std::string_view column_separator, std::string_view line_delimiter); + std::string_view column_separator, std::string_view line_delimiter, + bool with_bom); ~VCSVTransformer() = default; @@ -71,6 +72,8 @@ private: // For example: bitmap_to_string() may return large volume of data. // And the speed is relative low, in my test, is about 6.5MB/s. fmt::memory_buffer _outstream_buffer; + + bool _with_bom = false; }; } // namespace doris::vectorized diff --git a/be/src/vec/sink/vresult_sink.h b/be/src/vec/sink/vresult_sink.h index 0dd69ee84cd..c5b092bcd41 100644 --- a/be/src/vec/sink/vresult_sink.h +++ b/be/src/vec/sink/vresult_sink.h @@ -70,6 +70,8 @@ struct ResultFileOptions { bool delete_existing_files = false; std::string file_suffix; + //Bring BOM when exporting to CSV format + bool with_bom = false; ResultFileOptions(const TResultFileSinkOptions& t_opt) { file_path = t_opt.file_path; @@ -81,6 +83,7 @@ struct ResultFileOptions { delete_existing_files = t_opt.__isset.delete_existing_files ? 
t_opt.delete_existing_files : false; file_suffix = t_opt.file_suffix; + with_bom = t_opt.with_bom; is_local_file = true; if (t_opt.__isset.broker_addresses) { diff --git a/be/src/vec/sink/writer/vfile_result_writer.cpp b/be/src/vec/sink/writer/vfile_result_writer.cpp index f7015e17c96..bf3d09cdda6 100644 --- a/be/src/vec/sink/writer/vfile_result_writer.cpp +++ b/be/src/vec/sink/writer/vfile_result_writer.cpp @@ -143,9 +143,10 @@ Status VFileResultWriter::_create_file_writer(const std::string& file_name) { _file_writer_impl)); switch (_file_opts->file_format) { case TFileFormatType::FORMAT_CSV_PLAIN: - _vfile_writer.reset(new VCSVTransformer( - _state, _file_writer_impl.get(), _vec_output_expr_ctxs, _output_object_data, - _header_type, _header, _file_opts->column_separator, _file_opts->line_delimiter)); + _vfile_writer.reset(new VCSVTransformer(_state, _file_writer_impl.get(), + _vec_output_expr_ctxs, _output_object_data, + _header_type, _header, _file_opts->column_separator, + _file_opts->line_delimiter, _file_opts->with_bom)); break; case TFileFormatType::FORMAT_PARQUET: _vfile_writer.reset(new VParquetTransformer( diff --git a/docs/en/docs/sql-manual/sql-reference/Data-Manipulation-Statements/Manipulation/EXPORT.md b/docs/en/docs/sql-manual/sql-reference/Data-Manipulation-Statements/Manipulation/EXPORT.md index 8eab419de6d..c6479d0e824 100644 --- a/docs/en/docs/sql-manual/sql-reference/Data-Manipulation-Statements/Manipulation/EXPORT.md +++ b/docs/en/docs/sql-manual/sql-reference/Data-Manipulation-Statements/Manipulation/EXPORT.md @@ -77,14 +77,25 @@ The bottom layer of the `Export` statement actually executes the `select...outfi The following parameters can be specified: - `label`: This parameter is optional, specifies the label of the export task. If this parameter is not specified, the system randomly assigns a label to the export task. + - `column_separator`: Specifies the exported column separator, default is `\t`, mulit-bytes is supported. This parameter is only used for `CSV` file format. + - `line_delimiter`: Specifies the line delimiter for export, the default is `\n`, mulit-bytes is supported. This parameter is only used for `CSV` file format. + - `timeout`: The timeout period of the export job, the default is 2 hours, the unit is seconds. + - `columns`: Specifies certain columns of the export job table + - `format`: Specifies the file format, support: parquet, orc, csv, csv_with_names, csv_with_names_and_types.The default is csv format. + - `parallelism`: The concurrency degree of the `export` job, the default is `1`. The export job will be divided into `select..outfile..` statements of the number of `parallelism` to execute concurrently. (If the value of `parallelism` is greater than the number of tablets in the table, the system will automatically set `parallelism` to the number of tablets, that is, each `select..outfile..` statement is responsible for one tablet) + - `delete_existing_files`: default `false`. If it is specified as true, you will first delete all files specified in the directory specified by the file_path, and then export the data to the directory.For example: "file_path" = "/user/tmp", then delete all files and directory under "/user/"; "file_path" = "/user/tmp/", then delete all files and directory under "/user/tmp/" + - `max_file_size`: it is the limit for the size of a single file in the export job. If the result file exceeds this value, it will be split into multiple files. The valid range for `max_file_size` is [5MB, 2GB], with a default value of 1GB. 
(When exporting to the ORC file format, the actual size of the split files will be multiples of 64MB, for example, if max_file_size is specified as 5MB, the actual split size will be 64MB; if max_file_size is specified as 65MB, the actual split size [...] + + - `with_bom`: The default is false. If it is set to true, the exported file is encoded in UTF8 with BOM (valid only for CSV-related file format). + - `timeout`: This is the timeout parameter of the export job, the default timeout is 2 hours, and the unit is seconds. > Note that to use the `delete_existing_files` parameter, you also need to add the configuration `enable_delete_existing_files = true` to the fe.conf file and restart the FE. Only then will the `delete_existing_files` parameter take effect. Setting `delete_existing_files = true` is a dangerous operation and it is recommended to only use it in a testing environment. diff --git a/docs/zh-CN/docs/sql-manual/sql-reference/Data-Manipulation-Statements/Manipulation/EXPORT.md b/docs/zh-CN/docs/sql-manual/sql-reference/Data-Manipulation-Statements/Manipulation/EXPORT.md index ca04b4c8a42..28021c8d7f0 100644 --- a/docs/zh-CN/docs/sql-manual/sql-reference/Data-Manipulation-Statements/Manipulation/EXPORT.md +++ b/docs/zh-CN/docs/sql-manual/sql-reference/Data-Manipulation-Statements/Manipulation/EXPORT.md @@ -90,6 +90,8 @@ EXPORT - `delete_existing_files`: 默认为false,若指定为true,则会先删除`export_path`所指定目录下的所有文件,然后导出数据到该目录下。例如:"export_path" = "/user/tmp", 则会删除"/user/"下所有文件及目录;"file_path" = "/user/tmp/", 则会删除"/user/tmp/"下所有文件及目录。 + - `with_bom`: 默认为false,若指定为true,则导出的文件编码为带有BOM的UTF8编码(只对csv相关的文件格式生效)。 + - `timeout`:导出作业的超时时间,默认为2小时,单位是秒。 > 注意:要使用delete_existing_files参数,还需要在fe.conf中添加配置`enable_delete_existing_files = true`并重启fe,此时delete_existing_files才会生效。delete_existing_files = true 是一个危险的操作,建议只在测试环境中使用。 diff --git a/fe/fe-core/src/main/java/org/apache/doris/analysis/ExportStmt.java b/fe/fe-core/src/main/java/org/apache/doris/analysis/ExportStmt.java index 1b33630e802..123bfa727fb 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/analysis/ExportStmt.java +++ b/fe/fe-core/src/main/java/org/apache/doris/analysis/ExportStmt.java @@ -103,6 +103,7 @@ public class ExportStmt extends StatementBase { private String maxFileSize; private String deleteExistingFiles; + private String withBom; private SessionVariable sessionVariables; private String qualifiedUser; @@ -228,6 +229,7 @@ public class ExportStmt extends StatementBase { exportJob.setParallelism(this.parallelism); exportJob.setMaxFileSize(this.maxFileSize); exportJob.setDeleteExistingFiles(this.deleteExistingFiles); + exportJob.setWithBom(this.withBom); if (columns != null) { Splitter split = Splitter.on(',').trimResults().omitEmptyStrings(); @@ -354,6 +356,9 @@ public class ExportStmt extends StatementBase { // generate a random label this.label = "export_" + UUID.randomUUID(); } + + // with bom + this.withBom = properties.getOrDefault(OutFileClause.PROP_WITH_BOM, "false"); } private void checkColumns() throws DdlException { diff --git a/fe/fe-core/src/main/java/org/apache/doris/analysis/OutFileClause.java b/fe/fe-core/src/main/java/org/apache/doris/analysis/OutFileClause.java index 236788ce328..945a8b07452 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/analysis/OutFileClause.java +++ b/fe/fe-core/src/main/java/org/apache/doris/analysis/OutFileClause.java @@ -136,6 +136,7 @@ public class OutFileClause { private static final String PROP_SUCCESS_FILE_NAME = "success_file_name"; public static final String PROP_DELETE_EXISTING_FILES = 
"delete_existing_files"; public static final String PROP_FILE_SUFFIX = "file_suffix"; + public static final String PROP_WITH_BOM = "with_bom"; private static final String PARQUET_PROP_PREFIX = "parquet."; private static final String SCHEMA = "schema"; @@ -155,6 +156,7 @@ public class OutFileClause { private long maxFileSizeBytes = DEFAULT_MAX_FILE_SIZE_BYTES; private boolean deleteExistingFiles = false; private String fileSuffix = ""; + private boolean withBom = false; private BrokerDesc brokerDesc = null; // True if result is written to local disk. // If set to true, the brokerDesc must be null. @@ -566,6 +568,11 @@ public class OutFileClause { processedPropKeys.add(PROP_FILE_SUFFIX); } + if (properties.containsKey(PROP_WITH_BOM)) { + withBom = Boolean.valueOf(properties.get(PROP_WITH_BOM)).booleanValue(); + processedPropKeys.add(PROP_WITH_BOM); + } + if (properties.containsKey(PROP_SUCCESS_FILE_NAME)) { successFileName = properties.get(PROP_SUCCESS_FILE_NAME); FeNameFormat.checkOutfileSuccessFileName("file name", successFileName); @@ -805,6 +812,7 @@ public class OutFileClause { sinkOptions.setMaxFileSizeBytes(maxFileSizeBytes); sinkOptions.setDeleteExistingFiles(deleteExistingFiles); sinkOptions.setFileSuffix(fileSuffix); + sinkOptions.setWithBom(withBom); if (brokerDesc != null) { sinkOptions.setBrokerProperties(brokerDesc.getProperties()); diff --git a/fe/fe-core/src/main/java/org/apache/doris/load/ExportJob.java b/fe/fe-core/src/main/java/org/apache/doris/load/ExportJob.java index 0ae513d1db6..f6da9a6a97a 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/load/ExportJob.java +++ b/fe/fe-core/src/main/java/org/apache/doris/load/ExportJob.java @@ -166,6 +166,8 @@ public class ExportJob implements Writable { @SerializedName("tabletsNum") private Integer tabletsNum; + @SerializedName("withBom") + private String withBom; private TableRef tableRef; @@ -219,6 +221,7 @@ public class ExportJob implements Writable { this.columnSeparator = "\t"; this.lineDelimiter = "\n"; this.columns = ""; + this.withBom = "false"; } public ExportJob(long jobId) { @@ -554,6 +557,7 @@ public class ExportJob implements Writable { if (!deleteExistingFiles.isEmpty()) { outfileProperties.put(OutFileClause.PROP_DELETE_EXISTING_FILES, deleteExistingFiles); } + outfileProperties.put(OutFileClause.PROP_WITH_BOM, withBom); // broker properties // outfile clause's broker properties need 'broker.' 
prefix diff --git a/fe/fe-core/src/main/java/org/apache/doris/load/ExportMgr.java b/fe/fe-core/src/main/java/org/apache/doris/load/ExportMgr.java index 90be7f9f10e..efbfd33966f 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/load/ExportMgr.java +++ b/fe/fe-core/src/main/java/org/apache/doris/load/ExportMgr.java @@ -349,6 +349,7 @@ public class ExportMgr { infoMap.put("broker", job.getBrokerDesc().getName()); infoMap.put("column_separator", job.getColumnSeparator()); infoMap.put("format", job.getFormat()); + infoMap.put("with_bom", job.getWithBom()); infoMap.put("line_delimiter", job.getLineDelimiter()); infoMap.put("columns", job.getColumns()); infoMap.put("tablet_num", job.getTabletsNum()); diff --git a/fe/fe-core/src/main/java/org/apache/doris/load/ExportTaskExecutor.java b/fe/fe-core/src/main/java/org/apache/doris/load/ExportTaskExecutor.java index 26e873afd80..3f7c17fef68 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/load/ExportTaskExecutor.java +++ b/fe/fe-core/src/main/java/org/apache/doris/load/ExportTaskExecutor.java @@ -40,7 +40,6 @@ import org.apache.doris.scheduler.executor.TransientTaskExecutor; import org.apache.doris.thrift.TUniqueId; import com.google.common.collect.Lists; -import lombok.Setter; import lombok.extern.slf4j.Slf4j; import java.util.List; @@ -56,7 +55,6 @@ public class ExportTaskExecutor implements TransientTaskExecutor { ExportJob exportJob; - @Setter Long taskId; private StmtExecutor stmtExecutor; @@ -205,4 +203,8 @@ public class ExportTaskExecutor implements TransientTaskExecutor { } return Optional.empty(); } + + public void setTaskId(Long taskId) { + this.taskId = taskId; + } } diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/plans/commands/ExportCommand.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/plans/commands/ExportCommand.java index 6d4fb22282c..b43fc9348ce 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/plans/commands/ExportCommand.java +++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/plans/commands/ExportCommand.java @@ -88,6 +88,7 @@ public class ExportCommand extends Command implements ForwardWithSync { .add(PropertyAnalyzer.PROPERTIES_LINE_DELIMITER) .add(PropertyAnalyzer.PROPERTIES_TIMEOUT) .add("format") + .add(OutFileClause.PROP_WITH_BOM) .build(); private final List<String> nameParts; @@ -267,6 +268,9 @@ public class ExportCommand extends Command implements ForwardWithSync { exportJob.setFormat(fileProperties.getOrDefault(LoadStmt.KEY_IN_PARAM_FORMAT_TYPE, "csv") .toLowerCase()); + // set withBom + exportJob.setWithBom(fileProperties.getOrDefault(OutFileClause.PROP_WITH_BOM, "false")); + // set parallelism int parallelism; try { diff --git a/gensrc/thrift/DataSinks.thrift b/gensrc/thrift/DataSinks.thrift index f9e26e7a47c..068b5927004 100644 --- a/gensrc/thrift/DataSinks.thrift +++ b/gensrc/thrift/DataSinks.thrift @@ -128,6 +128,7 @@ struct TResultFileSinkOptions { 16: optional bool delete_existing_files; 17: optional string file_suffix; + 18: optional bool with_bom; } struct TMemoryScratchSink { diff --git a/regression-test/data/export_p0/test_with_bom.out b/regression-test/data/export_p0/test_with_bom.out new file mode 100644 index 00000000000..de995bdce0a --- /dev/null +++ b/regression-test/data/export_p0/test_with_bom.out @@ -0,0 +1,66 @@ +-- This file is automatically generated. 
You should know what you did if you want to edit this +-- !select_export1 -- +1 2017-10-01 +10 2017-10-01 +11 2017-10-01 +2 2017-10-01 +3 2017-10-01 +4 2017-10-01 +5 2017-10-01 +6 2017-10-01 +7 2017-10-01 +8 2017-10-01 +9 2017-10-01 + +-- !select_load1 -- +1 2017-10-01 +10 2017-10-01 +11 2017-10-01 +2 2017-10-01 +3 2017-10-01 +4 2017-10-01 +5 2017-10-01 +6 2017-10-01 +7 2017-10-01 +8 2017-10-01 +9 2017-10-01 + +-- !select_load1 -- +1 2017-10-01 +10 2017-10-01 +11 2017-10-01 +2 2017-10-01 +3 2017-10-01 +4 2017-10-01 +5 2017-10-01 +6 2017-10-01 +7 2017-10-01 +8 2017-10-01 +9 2017-10-01 + +-- !select_load1 -- +1 2017-10-01 +10 2017-10-01 +11 2017-10-01 +2 2017-10-01 +3 2017-10-01 +4 2017-10-01 +5 2017-10-01 +6 2017-10-01 +7 2017-10-01 +8 2017-10-01 +9 2017-10-01 + +-- !select_load1 -- +1 2017-10-01 +10 2017-10-01 +11 2017-10-01 +2 2017-10-01 +3 2017-10-01 +4 2017-10-01 +5 2017-10-01 +6 2017-10-01 +7 2017-10-01 +8 2017-10-01 +9 2017-10-01 + diff --git a/regression-test/suites/export_p0/test_with_bom.groovy b/regression-test/suites/export_p0/test_with_bom.groovy new file mode 100644 index 00000000000..ede264f189e --- /dev/null +++ b/regression-test/suites/export_p0/test_with_bom.groovy @@ -0,0 +1,241 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +import org.codehaus.groovy.runtime.IOGroovyMethods + +import java.nio.charset.StandardCharsets +import java.nio.file.Files +import java.nio.file.Paths + +suite("test_with_bom", "p0") { + // open nereids + sql """ set enable_nereids_planner=true """ + sql """ set enable_fallback_to_original_planner=false """ + + + String ak = getS3AK() + String sk = getS3SK() + String s3_endpoint = getS3Endpoint() + String region = getS3Region() + String bucket = context.config.otherConfigs.get("s3BucketName"); + + + def this_db = "regression_test_export_p0" + def table_export_name = "test_with_bom" + def table_load_name = "test_with_bom_load_back" + def outfile_path_prefix = """${bucket}/export/test_with_bom/exp_""" + def file_format = "csv" + + // test csv with bom + sql """ DROP TABLE IF EXISTS ${table_export_name} """ + sql """ + CREATE TABLE IF NOT EXISTS ${table_export_name} ( + `user_id` INT NOT NULL COMMENT "用户id", + `date` DATE NOT NULL COMMENT "数据灌入日期时间" + ) + DISTRIBUTED BY HASH(user_id) + PROPERTIES("replication_num" = "1"); + """ + StringBuilder sb = new StringBuilder() + int i = 1 + for (; i < 11; i ++) { + sb.append(""" + (${i}, '2017-10-01'), + """) + } + sb.append(""" + (${i}, '2017-10-01') + """) + sql """ INSERT INTO ${table_export_name} VALUES + ${sb.toString()} + """ + def insert_res = sql "show last insert;" + logger.info("insert result: " + insert_res.toString()) + order_qt_select_export1 """ SELECT * FROM ${table_export_name} t ORDER BY user_id; """ + + + def waiting_export = { export_label -> + while (true) { + def res = sql """ show export where label = "${export_label}";""" + logger.info("export state: " + res[0][2]) + if (res[0][2] == "FINISHED") { + def json = parseJson(res[0][11]) + assert json instanceof List + assertEquals("1", json.fileNumber[0][0]) + log.info("outfile_path: ${json.url[0][0]}") + return json.url[0][0]; + } else if (res[0][2] == "CANCELLED") { + throw new IllegalStateException("""export failed: ${res[0][10]}""") + } else { + sleep(5000) + } + } + } + + def check_bytes = { except_bytes, export_label -> + def show_export_res = sql_return_maparray """ show export where label = "${export_label}"; """ + def export_job = show_export_res[0] + def outfile_json = parseJson(export_job.OutfileInfo) + assertEquals(except_bytes, outfile_json[0][0].fileSize) + } + + // 1. exec export without bom + def uuid = UUID.randomUUID().toString() + def outFilePath = """${outfile_path_prefix}_${uuid}""" + def label = "label_${uuid}" + try { + // exec export + sql """ + EXPORT TABLE ${table_export_name} TO "s3://${outFilePath}/" + PROPERTIES( + "label" = "${label}", + "format" = "${file_format}" + ) + WITH S3( + "s3.endpoint" = "${s3_endpoint}", + "s3.region" = "${region}", + "s3.secret_key"="${sk}", + "s3.access_key" = "${ak}" + ); + """ + def outfile_url = waiting_export.call(label) + + order_qt_select_load1 """ select * from s3( + "uri" = "http://${s3_endpoint}${outfile_url.substring(4, outfile_url.length() - 1)}0.${file_format}", + "s3.access_key"= "${ak}", + "s3.secret_key" = "${sk}", + "format" = "csv", + "region" = "${region}" + ) ORDER BY c1; + """ + + // check outfile bytes + check_bytes("145bytes", label) + } finally { + } + + + // 2. 
exec export with bom + uuid = UUID.randomUUID().toString() + outFilePath = """${outfile_path_prefix}_${uuid}""" + label = "label_${uuid}" + try { + // exec export + sql """ + EXPORT TABLE ${table_export_name} TO "s3://${outFilePath}/" + PROPERTIES( + "label" = "${label}", + "format" = "${file_format}", + "with_bom" = "true" + ) + WITH S3( + "s3.endpoint" = "${s3_endpoint}", + "s3.region" = "${region}", + "s3.secret_key"="${sk}", + "s3.access_key" = "${ak}" + ); + """ + def outfile_url = waiting_export.call(label) + + order_qt_select_load1 """ select * from s3( + "uri" = "http://${s3_endpoint}${outfile_url.substring(4, outfile_url.length() - 1)}0.${file_format}", + "s3.access_key"= "${ak}", + "s3.secret_key" = "${sk}", + "format" = "csv", + "region" = "${region}" + ) ORDER BY c1; + """ + + // check outfile bytes + check_bytes("148bytes", label) + } finally { + } + + + // 3. test csv_with_names with bom + uuid = UUID.randomUUID().toString() + outFilePath = """${outfile_path_prefix}_${uuid}""" + label = "label_${uuid}" + try { + // exec export + sql """ + EXPORT TABLE ${table_export_name} TO "s3://${outFilePath}/" + PROPERTIES( + "label" = "${label}", + "format" = "csv_with_names", + "with_bom" = "true" + ) + WITH S3( + "s3.endpoint" = "${s3_endpoint}", + "s3.region" = "${region}", + "s3.secret_key"="${sk}", + "s3.access_key" = "${ak}" + ); + """ + def outfile_url = waiting_export.call(label) + + order_qt_select_load1 """ select * from s3( + "uri" = "http://${s3_endpoint}${outfile_url.substring(4, outfile_url.length() - 1)}0.${file_format}", + "s3.access_key"= "${ak}", + "s3.secret_key" = "${sk}", + "format" = "csv_with_names", + "region" = "${region}" + ) ORDER BY user_id; + """ + + // check outfile bytes + check_bytes("161bytes", label) + } finally { + } + + + // 4. test csv_with_names_and_types with bom + uuid = UUID.randomUUID().toString() + outFilePath = """${outfile_path_prefix}_${uuid}""" + label = "label_${uuid}" + try { + // exec export + sql """ + EXPORT TABLE ${table_export_name} TO "s3://${outFilePath}/" + PROPERTIES( + "label" = "${label}", + "format" = "csv_with_names_and_types", + "with_bom" = "true" + ) + WITH S3( + "s3.endpoint" = "${s3_endpoint}", + "s3.region" = "${region}", + "s3.secret_key"="${sk}", + "s3.access_key" = "${ak}" + ); + """ + def outfile_url = waiting_export.call(label) + + order_qt_select_load1 """ select * from s3( + "uri" = "http://${s3_endpoint}${outfile_url.substring(4, outfile_url.length() - 1)}0.${file_format}", + "s3.access_key"= "${ak}", + "s3.secret_key" = "${sk}", + "format" = "csv_with_names_and_types", + "region" = "${region}" + ) ORDER BY user_id; + """ + + // check outfile bytes + check_bytes("172bytes", label) + } finally { + } +} --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org For additional commands, e-mail: commits-h...@doris.apache.org
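[Editor's note] For readers skimming the BE change above, the reader-side handling boils down to the 3-byte check added in csv_reader.cpp: if the first data line starts with the UTF-8 BOM (0xEF 0xBB 0xBF), advance the pointer by 3 and shrink the reported size. A minimal standalone sketch of that rule (same byte comparison as the committed `_remove_bom`, but extracted out of the Doris `CsvReader` class so it compiles on its own) could look like this:

```cpp
#include <cstddef>
#include <cstdint>
#include <iostream>
#include <string>

// Sketch of the BOM-stripping rule from CsvReader::_remove_bom in this commit:
// if the buffer starts with the UTF-8 BOM, skip those 3 bytes and report a
// correspondingly smaller size; otherwise return the buffer unchanged.
const uint8_t* remove_bom(const uint8_t* ptr, size_t& size) {
    if (size >= 3 && ptr[0] == 0xEF && ptr[1] == 0xBB && ptr[2] == 0xBF) {
        size -= 3;
        return ptr + 3;
    }
    return ptr;
}

int main() {
    // A BOM followed by a regular CSV line, as an exported file would start.
    std::string line = "\xEF\xBB\xBF" "1,2017-10-01";
    size_t size = line.size();
    const uint8_t* ptr = reinterpret_cast<const uint8_t*>(line.data());

    ptr = remove_bom(ptr, size);
    std::cout << std::string(reinterpret_cast<const char*>(ptr), size) << "\n";
    // prints: 1,2017-10-01
    return 0;
}
```

As in the committed code, the check runs only once, on the first line that is actually consumed (skipped header lines counted by `_skip_lines` are left alone, since the BOM is consumed together with the first skipped line).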
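[Editor's note] On the write path, `VCSVTransformer::open()` appends the three BOM bytes before the optional CSV header when `with_bom` is set. Below is a hedged sketch of that ordering against a plain `std::ofstream`; the real code writes through `doris::io::FileWriter`, which is not reproduced here, and `write_csv` is a hypothetical helper for illustration only:

```cpp
#include <fstream>
#include <string>
#include <vector>

// Same byte sequence the commit defines in vcsv_transformer.cpp.
static const unsigned char kBom[] = {0xEF, 0xBB, 0xBF};

// Sketch only: write an optional UTF-8 BOM, then an optional header line,
// then the data rows, mimicking the order used by VCSVTransformer::open()
// followed by the normal row writes.
void write_csv(const std::string& path, bool with_bom, const std::string& header,
               const std::vector<std::string>& rows) {
    std::ofstream out(path, std::ios::binary);
    if (with_bom) {
        // BOM goes first, before the csv_with_names / csv_with_names_and_types header.
        out.write(reinterpret_cast<const char*>(kBom), sizeof(kBom));
    }
    if (!header.empty()) {
        out << header;  // e.g. "user_id\tdate\n" for csv_with_names
    }
    for (const auto& row : rows) {
        out << row;
    }
}
```

The regression test's size assertions are consistent with this ordering: assuming the default `\t` separator and `\n` delimiter, the 11 rows of `test_with_bom` take 9 × 13 + 2 × 14 = 145 bytes, enabling `with_bom` adds exactly the 3 BOM bytes (148 bytes), and the csv_with_names and csv_with_names_and_types cases grow further only by their header lines.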