This is an automated email from the ASF dual-hosted git repository.

morningman pushed a commit to branch branch-3.0
in repository https://gitbox.apache.org/repos/asf/doris.git


The following commit(s) were added to refs/heads/branch-3.0 by this push:
     new e76c6129e5d branch-3.0: [fix](tvf) Tvf supports to parse the enclose 
character in csv files #45407 (#45568)
e76c6129e5d is described below

commit e76c6129e5d84baa5c082431b57f7002ffc662fa
Author: github-actions[bot] 
<41898282+github-actions[bot]@users.noreply.github.com>
AuthorDate: Fri Dec 20 11:55:37 2024 +0800

    branch-3.0: [fix](tvf) Tvf supports to parse the enclose character in csv 
files #45407 (#45568)
    
    Cherry-picked from #45407
    
    Co-authored-by: Tiewei Fang <fangtie...@selectdb.com>
---
 be/src/vec/exec/format/csv/csv_reader.cpp          |  6 +-
 .../doris/common/util/FileFormatConstants.java     |  1 +
 .../ExternalFileTableValuedFunction.java           | 15 +++++
 .../data/external_table_p0/tvf/enclose_csv.csv     |  6 ++
 .../tvf/test_local_tvf_enclose.out                 | 15 +++++
 .../tvf/test_local_tvf_enclose.groovy              | 72 ++++++++++++++++++++++
 6 files changed, 114 insertions(+), 1 deletion(-)

diff --git a/be/src/vec/exec/format/csv/csv_reader.cpp 
b/be/src/vec/exec/format/csv/csv_reader.cpp
index bf0e543d650..b27bb050dc6 100644
--- a/be/src/vec/exec/format/csv/csv_reader.cpp
+++ b/be/src/vec/exec/format/csv/csv_reader.cpp
@@ -928,9 +928,13 @@ Status CsvReader::_prepare_parse(size_t* read_line, bool* 
is_parse_name) {
                 _trim_tailing_spaces, _trim_double_quotes, _value_separator,
                 _value_separator_length);
     } else {
+        // Passing `_file_slot_descs.size() - 1` to EncloseCsvLineReaderContext causes a BE core dump
+        // because _file_slot_descs is currently an empty vector (so `size() - 1` wraps around).
+        // The value is only used to reserve space,
+        // so it's ok to pass 0 to EncloseCsvLineReaderContext.
         text_line_reader_ctx = std::make_shared<EncloseCsvLineReaderContext>(
                 _line_delimiter, _line_delimiter_length, _value_separator, 
_value_separator_length,
-                _file_slot_descs.size() - 1, _enclose, _escape, _keep_cr);
+                0, _enclose, _escape, _keep_cr);
         _fields_splitter = std::make_unique<EncloseCsvTextFieldSplitter>(
                 _trim_tailing_spaces, false,
                 
std::static_pointer_cast<EncloseCsvLineReaderContext>(text_line_reader_ctx),
diff --git 
a/fe/fe-core/src/main/java/org/apache/doris/common/util/FileFormatConstants.java
 
b/fe/fe-core/src/main/java/org/apache/doris/common/util/FileFormatConstants.java
index bdb2e97b9f2..7050bec9d77 100644
--- 
a/fe/fe-core/src/main/java/org/apache/doris/common/util/FileFormatConstants.java
+++ 
b/fe/fe-core/src/main/java/org/apache/doris/common/util/FileFormatConstants.java
@@ -50,6 +50,7 @@ public class FileFormatConstants {
     public static final String PROP_COMPRESS = "compress";
     public static final String PROP_COMPRESS_TYPE = "compress_type";
     public static final String PROP_PATH_PARTITION_KEYS = 
"path_partition_keys";
+    public static final String PROP_ENCLOSE = "enclose";
 
     // decimal(p,s)
     public static final Pattern DECIMAL_TYPE_PATTERN = 
Pattern.compile("decimal\\((\\d+),(\\d+)\\)");
diff --git 
a/fe/fe-core/src/main/java/org/apache/doris/tablefunction/ExternalFileTableValuedFunction.java
 
b/fe/fe-core/src/main/java/org/apache/doris/tablefunction/ExternalFileTableValuedFunction.java
index 6f45a1cc0eb..55d046c2ed9 100644
--- 
a/fe/fe-core/src/main/java/org/apache/doris/tablefunction/ExternalFileTableValuedFunction.java
+++ 
b/fe/fe-core/src/main/java/org/apache/doris/tablefunction/ExternalFileTableValuedFunction.java
@@ -120,6 +120,7 @@ public abstract class ExternalFileTableValuedFunction 
extends TableValuedFunctio
     private TTextSerdeType textSerdeType = TTextSerdeType.JSON_TEXT_SERDE;
     private String columnSeparator = 
FileFormatConstants.DEFAULT_COLUMN_SEPARATOR;
     private String lineDelimiter = FileFormatConstants.DEFAULT_LINE_DELIMITER;
+    private byte enclose = 0;
     private String jsonRoot = "";
     private String jsonPaths = "";
     private boolean stripOuterArray;
@@ -231,6 +232,17 @@ public abstract class ExternalFileTableValuedFunction 
extends TableValuedFunctio
         }
         lineDelimiter = Separator.convertSeparator(lineDelimiter);
 
+        String enclosedString = getOrDefaultAndRemove(copiedProps, 
FileFormatConstants.PROP_ENCLOSE, "");
+        if (!Strings.isNullOrEmpty(enclosedString)) {
+            if (enclosedString.length() > 1) {
+                throw new AnalysisException("enclose should not be longer than 
one byte.");
+            }
+            enclose = (byte) enclosedString.charAt(0);
+            if (enclose == 0) {
+                throw new AnalysisException("enclose should not be byte [0].");
+            }
+        }
+
         jsonRoot = getOrDefaultAndRemove(copiedProps, 
FileFormatConstants.PROP_JSON_ROOT, "");
         jsonPaths = getOrDefaultAndRemove(copiedProps, 
FileFormatConstants.PROP_JSON_PATHS, "");
         readJsonByLine = Boolean.valueOf(
@@ -285,6 +297,9 @@ public abstract class ExternalFileTableValuedFunction 
extends TableValuedFunctio
         TFileTextScanRangeParams fileTextScanRangeParams = new 
TFileTextScanRangeParams();
         fileTextScanRangeParams.setColumnSeparator(this.columnSeparator);
         fileTextScanRangeParams.setLineDelimiter(this.lineDelimiter);
+        if (enclose != 0) {
+            fileTextScanRangeParams.setEnclose(enclose);
+        }
         fileAttributes.setTextParams(fileTextScanRangeParams);
         if (this.fileFormatType == TFileFormatType.FORMAT_CSV_PLAIN) {
             fileAttributes.setHeaderType(this.headerType);
diff --git a/regression-test/data/external_table_p0/tvf/enclose_csv.csv 
b/regression-test/data/external_table_p0/tvf/enclose_csv.csv
new file mode 100644
index 00000000000..ca787200ff1
--- /dev/null
+++ b/regression-test/data/external_table_p0/tvf/enclose_csv.csv
@@ -0,0 +1,6 @@
+id, field1, field2
+"1", "hello", "same, field"
+"2", "doris", "same, field2"
+"3", "nereids", "same, field3"
+"4", "pipeline", "same, field4"
+"5", "storage", "same, field5"
\ No newline at end of file
diff --git 
a/regression-test/data/external_table_p0/tvf/test_local_tvf_enclose.out 
b/regression-test/data/external_table_p0/tvf/test_local_tvf_enclose.out
new file mode 100644
index 00000000000..6e5d10e4858
--- /dev/null
+++ b/regression-test/data/external_table_p0/tvf/test_local_tvf_enclose.out
@@ -0,0 +1,15 @@
+-- This file is automatically generated. You should know what you did if you 
want to edit this
+-- !enclose_1 --
+"1"    "hello" "same, field"
+"2"    "doris" "same, field2"
+"3"    "nereids"       "same, field3"
+"4"    "pipeline"      "same, field4"
+"5"    "storage"       "same, field5"
+
+-- !enclose_2 --
+1      hello   same, field
+2      doris   same, field2
+3      nereids same, field3
+4      pipeline        same, field4
+5      storage same, field5
+
diff --git 
a/regression-test/suites/external_table_p0/tvf/test_local_tvf_enclose.groovy 
b/regression-test/suites/external_table_p0/tvf/test_local_tvf_enclose.groovy
new file mode 100644
index 00000000000..e7437cd20ec
--- /dev/null
+++ b/regression-test/suites/external_table_p0/tvf/test_local_tvf_enclose.groovy
@@ -0,0 +1,72 @@
+import org.junit.Assert
+
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+suite("test_local_tvf_enclose", "p0,tvf") {
+    List<List<Object>> backends =  sql """ show backends """
+    assertTrue(backends.size() > 0)
+    def be_id = backends[0][0]
+
+    String filename = "enclose_csv.csv"
+
+    def dataFilePath = context.config.dataPath + 
"/external_table_p0/tvf/${filename}"
+
+    def outFilePath="/"
+
+    for (List<Object> backend : backends) {
+         def be_host = backend[1]
+         scpFiles ("root", be_host, dataFilePath, outFilePath, false);
+    }
+
+
+    sql """set enable_nereids_planner=true"""
+    sql """set enable_fallback_to_original_planner=false"""
+
+    qt_enclose_1 """
+                    select * from local(
+                        "file_path" = "${filename}",
+                        "backend_id" = "${be_id}",
+                        "format" = "csv_with_names",
+                        "column_separator" = ", ",
+                        "enclose" = "\\\"") order by id;            
+                """
+
+    qt_enclose_2 """
+                    select * from local(
+                        "file_path" = "${filename}",
+                        "backend_id" = "${be_id}",
+                        "format" = "csv_with_names",
+                        "column_separator" = ", ",
+                        "enclose" = "\\\"",
+                        "trim_double_quotes" = "true") order by id;            
+                """
+
+    // test error case
+    test {
+        sql """
+                select * from local(
+                        "file_path" = "${filename}",
+                        "backend_id" = "${be_id}",
+                        "format" = "csv_with_names",
+                        "column_separator" = ", ",
+                        "enclose" = "\\\"\\\"") order by id;            
+                """
+        // check exception message contains
+        exception "enclose should not be longer than one byte."
+    }
+}


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org
For additional commands, e-mail: commits-h...@doris.apache.org

Reply via email to