(doris) branch master updated: [fix](csv) escape quote with double quote for csv format table (#50101)

morningman Wed, 04 Jun 2025 01:51:09 -0700

This is an automated email from the ASF dual-hosted git repository.

morningman pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/doris.git



The following commit(s) were added to refs/heads/master by this push:
     new 084c8023ada [fix](csv) escape quote with double quote for csv format table (#50101)
084c8023ada is described below

commit 084c8023ada3fe98cf19f347d36b1dbcb3a8fbb8
Author: Socrates <[email protected]>
AuthorDate: Wed Jun 4 15:33:00 2025 +0800

    [fix](csv) escape quote with double quote for csv format table (#50101)
    
    ### What problem does this PR solve?
    
    Problem Summary:
    According to the CSV standard format definition (RFC 4180), a quote
    character that appears inside a quoted field must be escaped by
    doubling it, i.e. `""` stands for one literal `"`. The current
    implementation does not handle this case, so input that contains
    quote characters inside a field may be parsed incorrectly.
---
 be/src/vec/data_types/serde/data_type_serde.h      |  10 ++
 .../vec/data_types/serde/data_type_string_serde.h  |  34 +++++
 be/src/vec/exec/format/csv/csv_reader.cpp          |   7 +-
 .../file_reader/new_plain_text_line_reader.cpp     |  19 ++-
 .../file_reader/new_plain_text_line_reader.h       |   3 +
 be/test/vec/data_types/data_type_string_test.cpp   |  60 ++++++++
 .../new_plain_text_line_reader_test.cpp            | 168 +++++++++++++++++++++
 .../scripts/create_preinstalled_scripts/run76.hql  |  10 ++
 .../csv_json_table_simple.csv                      |  10 ++
 .../hive/test_hive_serde_prop.out                  | Bin 1538 -> 1534 bytes
 .../external_table_p0/hive/test_open_csv_serde.out | Bin 0 -> 1607 bytes
 .../test_csv_with_enclose_and_escapeS3_load.out    | Bin 743 -> 743 bytes
 .../enclose_with_escape_doublequote.csv            |   1 +
 .../stream_load/enclose_with_escape_quote.csv      |   1 +
 .../test_csv_with_enclose_and_escape.out           | Bin 709 -> 863 bytes
 .../hive/test_open_csv_serde.groovy                |  45 ++++++
 .../test_csv_with_enclose_and_escape.groovy        |   3 +-
 17 files changed, 363 insertions(+), 8 deletions(-)

diff --git a/be/src/vec/data_types/serde/data_type_serde.h 
b/be/src/vec/data_types/serde/data_type_serde.h
index cc9baec4f1e..ae3c0426c70 100644
--- a/be/src/vec/data_types/serde/data_type_serde.h
+++ b/be/src/vec/data_types/serde/data_type_serde.h
@@ -151,6 +151,8 @@ public:
          */
         bool converted_from_string = false;
 
+        char quote_char = '"';
+
         char escape_char = 0;
         /**
          * flags for each byte to indicate if escape is needed.
@@ -267,6 +269,14 @@ public:
 
     virtual Status deserialize_one_cell_from_json(IColumn& column, Slice& 
slice,
                                                   const FormatOptions& 
options) const = 0;
+
+    // In some cases, CSV and JSON deserialization behaviors may differ
+    // so we provide a default implementation that uses JSON deserialization
+    virtual Status deserialize_one_cell_from_csv(IColumn& column, Slice& slice,
+                                                 const FormatOptions& options) 
const {
+        return deserialize_one_cell_from_json(column, slice, options);
+    }
+
     // deserialize text vector is to avoid virtual function call in complex 
type nested loop
     virtual Status deserialize_column_from_json_vector(IColumn& column, 
std::vector<Slice>& slices,
                                                        uint64_t* 
num_deserialized,
diff --git a/be/src/vec/data_types/serde/data_type_string_serde.h 
b/be/src/vec/data_types/serde/data_type_string_serde.h
index 09d6762e43c..b1b76ed0e87 100644
--- a/be/src/vec/data_types/serde/data_type_string_serde.h
+++ b/be/src/vec/data_types/serde/data_type_string_serde.h
@@ -64,6 +64,31 @@ inline void escape_string(const char* src, size_t& len, char 
escape_char) {
     len = dest_ptr - start;
 }
 
+// specially escape quote with double quote
+inline void escape_string_for_csv(const char* src, size_t& len, char 
escape_char, char quote_char) {
+    const char* start = src;
+    char* dest_ptr = const_cast<char*>(src);
+    const char* end = src + len;
+    bool escape_next_char = false;
+
+    while (src < end) {
+        if ((src < end - 1 && *src == quote_char && *(src + 1) == quote_char) 
||
+            *src == escape_char) {
+            escape_next_char = !escape_next_char;
+        } else {
+            escape_next_char = false;
+        }
+
+        if (escape_next_char) {
+            ++src;
+        } else {
+            *dest_ptr++ = *src++;
+        }
+    }
+
+    len = dest_ptr - start;
+}
+
 template <typename ColumnType>
 class DataTypeStringSerDeBase : public DataTypeSerDe {
     using ColumnStrType = ColumnType;
@@ -189,6 +214,15 @@ public:
         return Status::OK();
     }
 
+    Status deserialize_one_cell_from_csv(IColumn& column, Slice& slice,
+                                         const FormatOptions& options) const 
override {
+        if (options.escape_char != 0) {
+            escape_string_for_csv(slice.data, slice.size, options.escape_char, 
options.quote_char);
+        }
+        assert_cast<ColumnType&>(column).insert_data(slice.data, slice.size);
+        return Status::OK();
+    }
+
     Status deserialize_one_cell_from_hive_text(
             IColumn& column, Slice& slice, const FormatOptions& options,
             int hive_text_complex_type_delimiter_level = 1) const override {
diff --git a/be/src/vec/exec/format/csv/csv_reader.cpp 
b/be/src/vec/exec/format/csv/csv_reader.cpp
index 5af50597ce3..6a25d6f3687 100644
--- a/be/src/vec/exec/format/csv/csv_reader.cpp
+++ b/be/src/vec/exec/format/csv/csv_reader.cpp
@@ -337,6 +337,7 @@ Status CsvReader::init_reader(bool is_load) {
             (_state != nullptr && 
_state->trim_tailing_spaces_for_external_table_query());
 
     _options.escape_char = _escape;
+    _options.quote_char = _enclose;
     if (_params.file_attributes.text_params.collection_delimiter.empty()) {
         switch (_text_serde_type) {
         case TTextSerdeType::JSON_TEXT_SERDE:
@@ -623,8 +624,8 @@ Status CsvReader::deserialize_nullable_string(IColumn& 
column, Slice& slice) {
         }
     }
     static DataTypeStringSerDe stringSerDe;
-    auto st = 
stringSerDe.deserialize_one_cell_from_json(null_column.get_nested_column(), 
slice,
-                                                         _options);
+    auto st = 
stringSerDe.deserialize_one_cell_from_csv(null_column.get_nested_column(), 
slice,
+                                                        _options);
     if (!st.ok()) {
         // fill null if fail
         null_column.insert_data(nullptr, 0); // 0 is meaningless here
@@ -677,7 +678,7 @@ Status CsvReader::_fill_dest_columns(const Slice& line, 
Block* block,
             switch (_text_serde_type) {
             case TTextSerdeType::JSON_TEXT_SERDE:
                 RETURN_IF_ERROR(
-                        _serdes[i]->deserialize_one_cell_from_json(*col_ptr, 
slice, _options));
+                        _serdes[i]->deserialize_one_cell_from_csv(*col_ptr, 
slice, _options));
                 break;
             case TTextSerdeType::HIVE_TEXT_SERDE:
                 RETURN_IF_ERROR(
diff --git a/be/src/vec/exec/format/file_reader/new_plain_text_line_reader.cpp 
b/be/src/vec/exec/format/file_reader/new_plain_text_line_reader.cpp
index 640f70f134c..ec1c5b8d299 100644
--- a/be/src/vec/exec/format/file_reader/new_plain_text_line_reader.cpp
+++ b/be/src/vec/exec/format/file_reader/new_plain_text_line_reader.cpp
@@ -149,11 +149,22 @@ void 
EncloseCsvLineReaderContext::_on_pre_match_enclose(const uint8_t* start, si
                 _should_escape = !_should_escape;
             } else if (_should_escape) [[unlikely]] {
                 _should_escape = false;
-            } else if (start[_idx] == _enclose) [[unlikely]] {
-                _state.forward_to(ReaderState::MATCH_ENCLOSE);
-                ++_idx;
-                return;
+            } else if (_quote_escape) {
+                if (start[_idx] == _enclose) {
+                    // double quote, escaped by quote
+                    _quote_escape = false;
+                } else {
+                    // match enclose
+                    _quote_escape = false;
+                    _state.forward_to(ReaderState::MATCH_ENCLOSE);
+                    return;
+                }
+            } else if (start[_idx] == _enclose) {
+                _quote_escape = true;
+            } else {
+                _quote_escape = false;
             }
+
             ++_idx;
         } while (_idx != len);
 
diff --git a/be/src/vec/exec/format/file_reader/new_plain_text_line_reader.h 
b/be/src/vec/exec/format/file_reader/new_plain_text_line_reader.h
index e1c30607e1b..9f861ae972c 100644
--- a/be/src/vec/exec/format/file_reader/new_plain_text_line_reader.h
+++ b/be/src/vec/exec/format/file_reader/new_plain_text_line_reader.h
@@ -220,6 +220,7 @@ public:
     inline void refresh_impl() {
         _idx = 0;
         _should_escape = false;
+        _quote_escape = false;
         _result = nullptr;
         _column_sep_positions.clear();
         _state.reset();
@@ -254,6 +255,8 @@ private:
 
     size_t _idx = 0;
     bool _should_escape = false;
+    // quote is specially escaped by quote in csv format
+    bool _quote_escape = false;
 
     const std::string _column_sep;
     std::vector<size_t> _column_sep_positions;
diff --git a/be/test/vec/data_types/data_type_string_test.cpp 
b/be/test/vec/data_types/data_type_string_test.cpp
index 0057f6006a8..e5fe1cff6a2 100644
--- a/be/test/vec/data_types/data_type_string_test.cpp
+++ b/be/test/vec/data_types/data_type_string_test.cpp
@@ -40,6 +40,7 @@
 #include "vec/data_types/data_type.h"
 #include "vec/data_types/data_type_factory.hpp"
 #include "vec/data_types/data_type_nullable.h"
+#include "vec/data_types/serde/data_type_string_serde.h"
 #include "vec/io/reader_buffer.h"
 
 namespace doris::vectorized {
@@ -337,4 +338,63 @@ TEST_F(DataTypeStringTest, get_field) {
     expr_node.string_literal.value = "a";
     EXPECT_EQ(dt_str.get_field(expr_node), 
Field::create_field<TYPE_STRING>("a"));
 }
+TEST_F(DataTypeStringTest, escape_string) {
+    {
+        char test_str[] = "hello\\world";
+        size_t len = strlen(test_str);
+        escape_string(test_str, len, '\\');
+        EXPECT_EQ(std::string(test_str, len), "helloworld");
+    }
+    {
+        char test_str[] = "helloworld";
+        size_t len = strlen(test_str);
+        escape_string(test_str, len, '\\');
+        EXPECT_EQ(std::string(test_str, len), "helloworld");
+    }
+    {
+        char test_str[] = R"(hello\\world)";
+        size_t len = strlen(test_str);
+        escape_string(test_str, len, '\\');
+        EXPECT_EQ(std::string(test_str, len), R"(hello\world)");
+    }
+    {
+        char test_str[] = R"(\\hello\\)";
+        size_t len = strlen(test_str);
+        escape_string(test_str, len, '\\');
+        EXPECT_EQ(std::string(test_str, len), R"(\hello\)");
+    }
+}
+
+TEST_F(DataTypeStringTest, escape_string_for_csv) {
+    {
+        char test_str[] = R"(hello""world)";
+        size_t len = strlen(test_str);
+        escape_string_for_csv(test_str, len, '\\', '"');
+        EXPECT_EQ(std::string(test_str, len), R"(hello"world)");
+    }
+    {
+        char test_str[] = "helloworld";
+        size_t len = strlen(test_str);
+        escape_string_for_csv(test_str, len, '\\', '"');
+        EXPECT_EQ(std::string(test_str, len), "helloworld");
+    }
+    {
+        char test_str[] = R"("hello""world")";
+        size_t len = strlen(test_str);
+        escape_string_for_csv(test_str, len, '\\', '"');
+        EXPECT_EQ(std::string(test_str, len), R"("hello"world")");
+    }
+    {
+        char test_str[] = R"(\\"hello\\""world\\)";
+        size_t len = strlen(test_str);
+        escape_string_for_csv(test_str, len, '\\', '"');
+        EXPECT_EQ(std::string(test_str, len), R"(\"hello\"world\)");
+    }
+    {
+        char test_str[] = "";
+        size_t len = strlen(test_str);
+        escape_string_for_csv(test_str, len, '\\', '"');
+        EXPECT_EQ(std::string(test_str, len), "");
+    }
+}
 } // namespace doris::vectorized
\ No newline at end of file
diff --git 
a/be/test/vec/exec/format/file_reader/new_plain_text_line_reader_test.cpp 
b/be/test/vec/exec/format/file_reader/new_plain_text_line_reader_test.cpp
new file mode 100644
index 00000000000..2cc2796048e
--- /dev/null
+++ b/be/test/vec/exec/format/file_reader/new_plain_text_line_reader_test.cpp
@@ -0,0 +1,168 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+// be/test/vec/exec/format/file_reader/new_plain_text_line_reader_test.cpp
+
+#include "vec/exec/format/file_reader/new_plain_text_line_reader.h"
+
+#include <gtest/gtest.h>
+
+namespace doris::vectorized {
+
+// Base test class for text line reader tests
+class PlainTextLineReaderTest : public testing::Test {
+protected:
+    // Helper function to verify line splitting results
+    void verify_split_result(const std::string& input, const std::string& 
line_delim, bool keep_cr,
+                             const std::vector<std::string>& expected_lines) {
+        PlainTextLineReaderCtx ctx(line_delim, line_delim.size(), keep_cr);
+        const auto* data = reinterpret_cast<const uint8_t*>(input.c_str());
+        size_t pos = 0;
+        size_t size = input.size();
+        std::vector<std::string> actual_lines;
+
+        while (pos < size) {
+            ctx.refresh();
+            const auto* line_end = ctx.read_line(data + pos, size - pos);
+            if (!line_end) {
+                actual_lines.emplace_back(reinterpret_cast<const char*>(data + 
pos), size - pos);
+                break;
+            }
+            size_t line_len = line_end - (data + pos);
+            actual_lines.emplace_back(reinterpret_cast<const char*>(data + 
pos), line_len);
+            pos += line_len + ctx.line_delimiter_length();
+        }
+
+        ASSERT_EQ(expected_lines, actual_lines);
+    }
+};
+
+// Test cases for PlainTextLineReaderCtx
+TEST_F(PlainTextLineReaderTest, PlainTextBasic) {
+    verify_split_result("line1\nline2\nline3", "\n", false, {"line1", "line2", 
"line3"});
+
+    verify_split_result("line1\r\nline2\r\nline3", "\r\n", false, {"line1", 
"line2", "line3"});
+
+    verify_split_result("line1\r\nline2\r\nline3", "\n", true, {"line1\r", 
"line2\r", "line3"});
+
+    verify_split_result("line1\n\nline3", "\n", false, {"line1", "", "line3"});
+
+    verify_split_result("line1||line2||line3", "||", false, {"line1", "line2", 
"line3"});
+}
+
+// Test class for CSV line reader with enclosure support
+class EncloseCsvLineReaderTest : public testing::Test {
+protected:
+    // Helper function to verify CSV splitting results including column 
positions
+    void verify_csv_split(const std::string& input, const std::string& 
line_delim,
+                          const std::string& col_sep, char enclose, char 
escape, bool keep_cr,
+                          const std::vector<std::string>& expected_lines,
+                          const std::vector<std::vector<size_t>>& 
expected_col_positions) {
+        EncloseCsvLineReaderContext ctx(line_delim, line_delim.size(), 
col_sep, col_sep.size(), 10,
+                                        enclose, escape, keep_cr);
+
+        const auto* data = reinterpret_cast<const uint8_t*>(input.c_str());
+        size_t pos = 0;
+        size_t size = input.size();
+        std::vector<std::string> actual_lines;
+        std::vector<std::vector<size_t>> actual_col_positions;
+
+        while (pos < size) {
+            ctx.refresh();
+            const uint8_t* line_end = ctx.read_line(data + pos, size - pos);
+            if (!line_end) {
+                actual_lines.emplace_back(reinterpret_cast<const char*>(data + 
pos), size - pos);
+                actual_col_positions.push_back(ctx.column_sep_positions());
+                break;
+            }
+            size_t line_len = line_end - (data + pos);
+            actual_lines.emplace_back(reinterpret_cast<const char*>(data + 
pos), line_len);
+            actual_col_positions.push_back(ctx.column_sep_positions());
+            pos += line_len + ctx.line_delimiter_length();
+        }
+
+        ASSERT_EQ(expected_lines, actual_lines);
+        ASSERT_EQ(expected_col_positions, actual_col_positions);
+    }
+};
+
+// Basic CSV format test cases
+TEST_F(EncloseCsvLineReaderTest, CsvBasic) {
+    verify_csv_split("a,b,c\nd,e,f", "\n", ",", '"', '\\', false, {"a,b,c", 
"d,e,f"},
+                     {{1, 3}, {1, 3}});
+
+    verify_csv_split("\"a,x\",b,c\n\"d,y\",e,f", "\n", ",", '"', '\\', false,
+                     {"\"a,x\",b,c", "\"d,y\",e,f"}, {{5, 7}, {5, 7}});
+
+    verify_csv_split("\"a\"\"x\",b,c\n\"d\\\"y\",e,f", "\n", ",", '"', '\\', 
false,
+                     {R"("a""x",b,c)", R"("d\"y",e,f)"}, {{6, 8}, {6, 8}});
+
+    verify_csv_split("a||b||c\nd||e||f", "\n", "||", '"', '\\', false, 
{"a||b||c", "d||e||f"},
+                     {{1, 4}, {1, 4}});
+}
+
+// Edge cases and corner scenarios
+TEST_F(EncloseCsvLineReaderTest, EdgeCases) {
+    verify_csv_split("\n\na,b,c", "\n", ",", '"', '\\', false, {"", "", 
"a,b,c"}, {{}, {}, {1, 3}});
+
+    verify_csv_split("\"abc,def\nghi,jkl", "\n", ",", '"', '\\', false, 
{"\"abc,def\nghi,jkl"},
+                     {{}});
+
+    verify_csv_split("a,b\r\nc,d\ne,f", "\r\n", ",", '"', '\\', false, {"a,b", 
"c,d\ne,f"},
+                     {{1}, {1, 5}});
+
+    verify_csv_split(R"(\,\"\n,b,c)", "\n", ",", '"', '\\', false, 
{R"(\,\"\n,b,c)"}, {{1, 6, 8}});
+}
+
+TEST_F(EncloseCsvLineReaderTest, QuoteEscaping) {
+    // Test multiple quoted fields with double-quote escaping in one line
+    verify_csv_split(R"("hello ""world\n""","foo ""bar""","test ""quote"" 
here")", "\n", ",", '"',
+                     '\\', false, {R"("hello ""world\n""","foo ""bar""","test 
""quote"" here")"},
+                     {{19, 33}});
+
+    // Test JSON-like string with escaped quotes
+    verify_csv_split(
+            R"({""code"": ""100"", ""message"": ""query success"", ""data"": 
{""status"": ""1""}})",
+            "\n", ",", '"', '\\', false,
+            {R"({""code"": ""100"", ""message"": ""query success"", ""data"": 
{""status"": ""1""}})"},
+            {{18, 50}});
+
+    // Test custom enclose character
+    verify_csv_split(R"({|code|: |100|, |message|: |query success|, |data|: 
{|status|: |1|}})",
+                     "\n", ",", '|', '\\', false,
+                     {R"({|code|: |100|, |message|: |query success|, |data|: 
{|status|: |1|}})"},
+                     {{14, 42}});
+}
+
+TEST_F(EncloseCsvLineReaderTest, MultiCharDelimiters) {
+    // Test multi-character line delimiter
+    verify_csv_split("a,b,c\r\n\nd,e,f", "\r\n\n", ",", '"', '\\', false, 
{"a,b,c", "d,e,f"},
+                     {{1, 3}, {1, 3}});
+
+    // Test multi-character column delimiter
+    verify_csv_split("a|||b|||c\nd|||e|||f", "\n", "|||", '"', '\\', false,
+                     {"a|||b|||c", "d|||e|||f"}, {{1, 5}, {1, 5}});
+
+    // Test both multi-character line and column delimiters
+    verify_csv_split("a|||b|||c\r\n\nd|||e|||f", "\r\n\n", "|||", '"', '\\', 
false,
+                     {"a|||b|||c", "d|||e|||f"}, {{1, 5}, {1, 5}});
+
+    verify_csv_split("\"a|||b\"|||c\r\n\n\"d|||e\"|||f", "\r\n\n", "|||", '"', 
'\\', false,
+                     {"\"a|||b\"|||c", "\"d|||e\"|||f"}, {{7}, {7}});
+}
+
+} // namespace doris::vectorized
diff --git 
a/docker/thirdparties/docker-compose/hive/scripts/create_preinstalled_scripts/run76.hql
 
b/docker/thirdparties/docker-compose/hive/scripts/create_preinstalled_scripts/run76.hql
index c003c9e7d50..fe11a46067c 100755
--- 
a/docker/thirdparties/docker-compose/hive/scripts/create_preinstalled_scripts/run76.hql
+++ 
b/docker/thirdparties/docker-compose/hive/scripts/create_preinstalled_scripts/run76.hql
@@ -1,3 +1,4 @@
+create database if not exists multi_catalog;
 use multi_catalog;
 
 CREATE TABLE text_table_normal_skip_header (
@@ -20,6 +21,15 @@ STORED AS TEXTFILE
 LOCATION '/user/doris/preinstalled_data/text/text_table_compressed_skip_header'
 TBLPROPERTIES ("skip.header.line.count"="5");
 
+CREATE TABLE csv_json_table_simple (
+  id STRING,
+  status_json STRING
+)
+ROW FORMAT SERDE 
+  'org.apache.hadoop.hive.serde2.OpenCSVSerde'
+STORED AS TEXTFILE
+LOCATION '/user/doris/preinstalled_data/csv/csv_json_table_simple';
+
 create database if not exists openx_json;
 use openx_json;
 
diff --git 
a/docker/thirdparties/docker-compose/hive/scripts/preinstalled_data/csv/csv_json_table_simple/csv_json_table_simple.csv
 
b/docker/thirdparties/docker-compose/hive/scripts/preinstalled_data/csv/csv_json_table_simple/csv_json_table_simple.csv
new file mode 100644
index 00000000000..7e17354e161
--- /dev/null
+++ 
b/docker/thirdparties/docker-compose/hive/scripts/preinstalled_data/csv/csv_json_table_simple/csv_json_table_simple.csv
@@ -0,0 +1,10 @@
+"1001","{""code"": ""100"", ""message"": ""query success"", ""data"": 
{""status"": ""1""}}"
+"1002","{""code"": ""100"", ""message"": ""query success"", ""data"": 
{""status"": ""20""}}"
+"1003","{""code"": ""100"", ""message"": ""query success"", ""data"": 
{""status"": ""1""}}"
+"1004","{""code"": ""200"", ""message"": ""query failed"", ""data"": {}}"
+"1005","{""code"": ""100"", ""message"": ""query success"", ""data"": 
{""status"": ""20""}}"
+"1006","{""code"": ""100"", ""message"": ""query success"", ""data"": 
{""status"": ""1""}}"
+"1007","{""code"": ""300"", ""message"": ""invalid request"", ""data"": null}"
+"1008","{""code"": ""100"", ""message"": ""query success"", ""data"": 
{""status"": ""0""}}"
+"1009","{""code"": ""100"", ""message"": ""query success"", ""data"": 
{""status"": ""1""}}"
+"1010","{""code"": ""100"", ""message"": ""query success"", ""data"": 
{""status"": ""20""}}"
diff --git 
a/regression-test/data/external_table_p0/hive/test_hive_serde_prop.out 
b/regression-test/data/external_table_p0/hive/test_hive_serde_prop.out
index fd8e7bb4313..c2415c058f1 100644
Binary files 
a/regression-test/data/external_table_p0/hive/test_hive_serde_prop.out and 
b/regression-test/data/external_table_p0/hive/test_hive_serde_prop.out differ
diff --git 
a/regression-test/data/external_table_p0/hive/test_open_csv_serde.out 
b/regression-test/data/external_table_p0/hive/test_open_csv_serde.out
new file mode 100644
index 00000000000..a071f731f45
Binary files /dev/null and 
b/regression-test/data/external_table_p0/hive/test_open_csv_serde.out differ
diff --git 
a/regression-test/data/load_p0/broker_load/test_csv_with_enclose_and_escapeS3_load.out
 
b/regression-test/data/load_p0/broker_load/test_csv_with_enclose_and_escapeS3_load.out
index 0c1450e35fe..8d4444ac418 100644
Binary files 
a/regression-test/data/load_p0/broker_load/test_csv_with_enclose_and_escapeS3_load.out
 and 
b/regression-test/data/load_p0/broker_load/test_csv_with_enclose_and_escapeS3_load.out
 differ
diff --git 
a/regression-test/data/load_p0/stream_load/enclose_with_escape_doublequote.csv 
b/regression-test/data/load_p0/stream_load/enclose_with_escape_doublequote.csv
new file mode 100644
index 00000000000..41a0d65cf26
--- /dev/null
+++ 
b/regression-test/data/load_p0/stream_load/enclose_with_escape_doublequote.csv
@@ -0,0 +1 @@
+5,"abc""de,fg""h","2023-07-17","""abc""def""","2023-07-22:07:00:00","test 
""escape"" in enclose"
diff --git 
a/regression-test/data/load_p0/stream_load/enclose_with_escape_quote.csv 
b/regression-test/data/load_p0/stream_load/enclose_with_escape_quote.csv
new file mode 100644
index 00000000000..0ade6f86900
--- /dev/null
+++ b/regression-test/data/load_p0/stream_load/enclose_with_escape_quote.csv
@@ -0,0 +1 @@
+4,"abc\"de,fg\"h","2023-07-16","\"abc\"def\"","2023-07-21:06:00:00","test 
\"escape\" in enclose"
diff --git 
a/regression-test/data/load_p0/stream_load/test_csv_with_enclose_and_escape.out 
b/regression-test/data/load_p0/stream_load/test_csv_with_enclose_and_escape.out
index 5646d96230f..d4509165cf4 100644
Binary files 
a/regression-test/data/load_p0/stream_load/test_csv_with_enclose_and_escape.out 
and 
b/regression-test/data/load_p0/stream_load/test_csv_with_enclose_and_escape.out 
differ
diff --git 
a/regression-test/suites/external_table_p0/hive/test_open_csv_serde.groovy 
b/regression-test/suites/external_table_p0/hive/test_open_csv_serde.groovy
new file mode 100644
index 00000000000..c3b5794323d
--- /dev/null
+++ b/regression-test/suites/external_table_p0/hive/test_open_csv_serde.groovy
@@ -0,0 +1,45 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+
+suite("test_open_csv_serde","p0,external,tvf,hive,external_docker,external_docker_hive")
 {
+    String enabled = context.config.otherConfigs.get("enableHiveTest")
+    if (enabled == null || !enabled.equalsIgnoreCase("true")) {
+        logger.info("diable Hive test.")
+        return;
+    }
+
+    for (String hivePrefix : ["hive2","hive3"]) {
+    
+        String hms_port = context.config.otherConfigs.get(hivePrefix + 
"HmsPort")
+        String catalog_name = "${hivePrefix}_test_open_csv_serde"
+        String externalEnvIp = context.config.otherConfigs.get("externalEnvIp")
+        def hdfsUserName = "doris"
+        String hdfs_port = context.config.otherConfigs.get(hivePrefix + 
"HdfsPort")
+        def defaultFS = "hdfs://${externalEnvIp}:${hdfs_port}"
+
+        sql """drop catalog if exists ${catalog_name}"""
+        sql """create catalog if not exists ${catalog_name} properties (
+            "type"="hms",
+            'hive.metastore.uris' = 'thrift://${externalEnvIp}:${hms_port}'
+        );"""
+        sql """use `${catalog_name}`.`multi_catalog`"""
+
+        qt_csv_escape_quote_in_enclose """select * from csv_json_table_simple 
order by id;"""
+        // TODO: add more case after refactor csv_reader and text_reader
+    }
+}
\ No newline at end of file
diff --git 
a/regression-test/suites/load_p0/stream_load/test_csv_with_enclose_and_escape.groovy
 
b/regression-test/suites/load_p0/stream_load/test_csv_with_enclose_and_escape.groovy
index 1562fa35cfd..5625a7e6de6 100644
--- 
a/regression-test/suites/load_p0/stream_load/test_csv_with_enclose_and_escape.groovy
+++ 
b/regression-test/suites/load_p0/stream_load/test_csv_with_enclose_and_escape.groovy
@@ -1,4 +1,3 @@
-
 // Licensed to the Apache Software Foundation (ASF) under one
 // or more contributor license agreements.  See the NOTICE file
 // distributed with this work for additional information
@@ -38,6 +37,8 @@ suite("test_csv_with_enclose_and_escape", "p0") {
     def normalCases = [
             'enclose_normal.csv',
             'enclose_with_escape.csv',
+            'enclose_with_escape_quote.csv',
+            'enclose_with_escape_doublequote.csv',
             'enclose_wrong_position.csv',
             'enclose_empty_values.csv'
     ]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

(doris) branch master updated: [fix](csv) escape quote with double quote for csv format table (#50101)

Reply via email to