This is an automated email from the ASF dual-hosted git repository.
morningman pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/master by this push:
new 084c8023ada [fix](csv) escape quote with double quote for csv format
table (#50101)
084c8023ada is described below
commit 084c8023ada3fe98cf19f347d36b1dbcb3a8fbb8
Author: Socrates <[email protected]>
AuthorDate: Wed Jun 4 15:33:00 2025 +0800
[fix](csv) escape quote with double quote for csv format table (#50101)
### What problem does this PR solve?
Problem Summary:
According to the CSV standard (RFC 4180), a quote character inside a
quoted field must be escaped by doubling it, i.e. writing a pair of quote
characters. Before this change the implementation did not handle doubled
quotes correctly, which may lead to incorrect parsing results when the
input string contains quote characters.
---
be/src/vec/data_types/serde/data_type_serde.h | 10 ++
.../vec/data_types/serde/data_type_string_serde.h | 34 +++++
be/src/vec/exec/format/csv/csv_reader.cpp | 7 +-
.../file_reader/new_plain_text_line_reader.cpp | 19 ++-
.../file_reader/new_plain_text_line_reader.h | 3 +
be/test/vec/data_types/data_type_string_test.cpp | 60 ++++++++
.../new_plain_text_line_reader_test.cpp | 168 +++++++++++++++++++++
.../scripts/create_preinstalled_scripts/run76.hql | 10 ++
.../csv_json_table_simple.csv | 10 ++
.../hive/test_hive_serde_prop.out | Bin 1538 -> 1534 bytes
.../external_table_p0/hive/test_open_csv_serde.out | Bin 0 -> 1607 bytes
.../test_csv_with_enclose_and_escapeS3_load.out | Bin 743 -> 743 bytes
.../enclose_with_escape_doublequote.csv | 1 +
.../stream_load/enclose_with_escape_quote.csv | 1 +
.../test_csv_with_enclose_and_escape.out | Bin 709 -> 863 bytes
.../hive/test_open_csv_serde.groovy | 45 ++++++
.../test_csv_with_enclose_and_escape.groovy | 3 +-
17 files changed, 363 insertions(+), 8 deletions(-)
diff --git a/be/src/vec/data_types/serde/data_type_serde.h
b/be/src/vec/data_types/serde/data_type_serde.h
index cc9baec4f1e..ae3c0426c70 100644
--- a/be/src/vec/data_types/serde/data_type_serde.h
+++ b/be/src/vec/data_types/serde/data_type_serde.h
@@ -151,6 +151,8 @@ public:
*/
bool converted_from_string = false;
+ char quote_char = '"';
+
char escape_char = 0;
/**
* flags for each byte to indicate if escape is needed.
@@ -267,6 +269,14 @@ public:
virtual Status deserialize_one_cell_from_json(IColumn& column, Slice&
slice,
const FormatOptions&
options) const = 0;
+
+ // CSV and JSON deserialization may differ for some types, so this hook
+ // defaults to the JSON implementation; a serde can override it for CSV.
+ virtual Status deserialize_one_cell_from_csv(IColumn& column, Slice& slice,
+ const FormatOptions& options)
const {
+ return deserialize_one_cell_from_json(column, slice, options);
+ }
+
// deserialize text vector is to avoid virtual function call in complex
type nested loop
virtual Status deserialize_column_from_json_vector(IColumn& column,
std::vector<Slice>& slices,
uint64_t*
num_deserialized,
diff --git a/be/src/vec/data_types/serde/data_type_string_serde.h
b/be/src/vec/data_types/serde/data_type_string_serde.h
index 09d6762e43c..b1b76ed0e87 100644
--- a/be/src/vec/data_types/serde/data_type_string_serde.h
+++ b/be/src/vec/data_types/serde/data_type_string_serde.h
@@ -64,6 +64,31 @@ inline void escape_string(const char* src, size_t& len, char
escape_char) {
len = dest_ptr - start;
}
+// In-place CSV unescape: a doubled quote_char collapses to one; escape_char is dropped.
+inline void escape_string_for_csv(const char* src, size_t& len, char
escape_char, char quote_char) {
+ const char* start = src;
+ char* dest_ptr = const_cast<char*>(src);
+ const char* end = src + len;
+ bool escape_next_char = false;
+
+ while (src < end) {
+ if ((src < end - 1 && *src == quote_char && *(src + 1) == quote_char)
||
+ *src == escape_char) {
+ escape_next_char = !escape_next_char;
+ } else {
+ escape_next_char = false;
+ }
+
+ if (escape_next_char) {
+ ++src;
+ } else {
+ *dest_ptr++ = *src++;
+ }
+ }
+
+ len = dest_ptr - start;
+}
+
template <typename ColumnType>
class DataTypeStringSerDeBase : public DataTypeSerDe {
using ColumnStrType = ColumnType;
@@ -189,6 +214,15 @@ public:
return Status::OK();
}
+ Status deserialize_one_cell_from_csv(IColumn& column, Slice& slice,
+ const FormatOptions& options) const
override {
+ if (options.escape_char != 0) {
+ escape_string_for_csv(slice.data, slice.size, options.escape_char,
options.quote_char);
+ }
+ assert_cast<ColumnType&>(column).insert_data(slice.data, slice.size);
+ return Status::OK();
+ }
+
Status deserialize_one_cell_from_hive_text(
IColumn& column, Slice& slice, const FormatOptions& options,
int hive_text_complex_type_delimiter_level = 1) const override {
diff --git a/be/src/vec/exec/format/csv/csv_reader.cpp
b/be/src/vec/exec/format/csv/csv_reader.cpp
index 5af50597ce3..6a25d6f3687 100644
--- a/be/src/vec/exec/format/csv/csv_reader.cpp
+++ b/be/src/vec/exec/format/csv/csv_reader.cpp
@@ -337,6 +337,7 @@ Status CsvReader::init_reader(bool is_load) {
(_state != nullptr &&
_state->trim_tailing_spaces_for_external_table_query());
_options.escape_char = _escape;
+ _options.quote_char = _enclose;
if (_params.file_attributes.text_params.collection_delimiter.empty()) {
switch (_text_serde_type) {
case TTextSerdeType::JSON_TEXT_SERDE:
@@ -623,8 +624,8 @@ Status CsvReader::deserialize_nullable_string(IColumn&
column, Slice& slice) {
}
}
static DataTypeStringSerDe stringSerDe;
- auto st =
stringSerDe.deserialize_one_cell_from_json(null_column.get_nested_column(),
slice,
- _options);
+ auto st =
stringSerDe.deserialize_one_cell_from_csv(null_column.get_nested_column(),
slice,
+ _options);
if (!st.ok()) {
// fill null if fail
null_column.insert_data(nullptr, 0); // 0 is meaningless here
@@ -677,7 +678,7 @@ Status CsvReader::_fill_dest_columns(const Slice& line,
Block* block,
switch (_text_serde_type) {
case TTextSerdeType::JSON_TEXT_SERDE:
RETURN_IF_ERROR(
- _serdes[i]->deserialize_one_cell_from_json(*col_ptr,
slice, _options));
+ _serdes[i]->deserialize_one_cell_from_csv(*col_ptr,
slice, _options));
break;
case TTextSerdeType::HIVE_TEXT_SERDE:
RETURN_IF_ERROR(
diff --git a/be/src/vec/exec/format/file_reader/new_plain_text_line_reader.cpp
b/be/src/vec/exec/format/file_reader/new_plain_text_line_reader.cpp
index 640f70f134c..ec1c5b8d299 100644
--- a/be/src/vec/exec/format/file_reader/new_plain_text_line_reader.cpp
+++ b/be/src/vec/exec/format/file_reader/new_plain_text_line_reader.cpp
@@ -149,11 +149,22 @@ void
EncloseCsvLineReaderContext::_on_pre_match_enclose(const uint8_t* start, si
_should_escape = !_should_escape;
} else if (_should_escape) [[unlikely]] {
_should_escape = false;
- } else if (start[_idx] == _enclose) [[unlikely]] {
- _state.forward_to(ReaderState::MATCH_ENCLOSE);
- ++_idx;
- return;
+ } else if (_quote_escape) {
+ if (start[_idx] == _enclose) {
+ // double quote, escaped by quote
+ _quote_escape = false;
+ } else {
+ // match enclose
+ _quote_escape = false;
+ _state.forward_to(ReaderState::MATCH_ENCLOSE);
+ return;
+ }
+ } else if (start[_idx] == _enclose) {
+ _quote_escape = true;
+ } else {
+ _quote_escape = false;
}
+
++_idx;
} while (_idx != len);
diff --git a/be/src/vec/exec/format/file_reader/new_plain_text_line_reader.h
b/be/src/vec/exec/format/file_reader/new_plain_text_line_reader.h
index e1c30607e1b..9f861ae972c 100644
--- a/be/src/vec/exec/format/file_reader/new_plain_text_line_reader.h
+++ b/be/src/vec/exec/format/file_reader/new_plain_text_line_reader.h
@@ -220,6 +220,7 @@ public:
inline void refresh_impl() {
_idx = 0;
_should_escape = false;
+ _quote_escape = false;
_result = nullptr;
_column_sep_positions.clear();
_state.reset();
@@ -254,6 +255,8 @@ private:
size_t _idx = 0;
bool _should_escape = false;
+ // set after an enclose char: the next char tells a doubled (escaped) quote from a closing one
+ bool _quote_escape = false;
const std::string _column_sep;
std::vector<size_t> _column_sep_positions;
diff --git a/be/test/vec/data_types/data_type_string_test.cpp
b/be/test/vec/data_types/data_type_string_test.cpp
index 0057f6006a8..e5fe1cff6a2 100644
--- a/be/test/vec/data_types/data_type_string_test.cpp
+++ b/be/test/vec/data_types/data_type_string_test.cpp
@@ -40,6 +40,7 @@
#include "vec/data_types/data_type.h"
#include "vec/data_types/data_type_factory.hpp"
#include "vec/data_types/data_type_nullable.h"
+#include "vec/data_types/serde/data_type_string_serde.h"
#include "vec/io/reader_buffer.h"
namespace doris::vectorized {
@@ -337,4 +338,63 @@ TEST_F(DataTypeStringTest, get_field) {
expr_node.string_literal.value = "a";
EXPECT_EQ(dt_str.get_field(expr_node),
Field::create_field<TYPE_STRING>("a"));
}
+TEST_F(DataTypeStringTest, escape_string) {
+ {
+ char test_str[] = "hello\\world";
+ size_t len = strlen(test_str);
+ escape_string(test_str, len, '\\');
+ EXPECT_EQ(std::string(test_str, len), "helloworld");
+ }
+ {
+ char test_str[] = "helloworld";
+ size_t len = strlen(test_str);
+ escape_string(test_str, len, '\\');
+ EXPECT_EQ(std::string(test_str, len), "helloworld");
+ }
+ {
+ char test_str[] = R"(hello\\world)";
+ size_t len = strlen(test_str);
+ escape_string(test_str, len, '\\');
+ EXPECT_EQ(std::string(test_str, len), R"(hello\world)");
+ }
+ {
+ char test_str[] = R"(\\hello\\)";
+ size_t len = strlen(test_str);
+ escape_string(test_str, len, '\\');
+ EXPECT_EQ(std::string(test_str, len), R"(\hello\)");
+ }
+}
+
+TEST_F(DataTypeStringTest, escape_string_for_csv) {
+ {
+ char test_str[] = R"(hello""world)";
+ size_t len = strlen(test_str);
+ escape_string_for_csv(test_str, len, '\\', '"');
+ EXPECT_EQ(std::string(test_str, len), R"(hello"world)");
+ }
+ {
+ char test_str[] = "helloworld";
+ size_t len = strlen(test_str);
+ escape_string_for_csv(test_str, len, '\\', '"');
+ EXPECT_EQ(std::string(test_str, len), "helloworld");
+ }
+ {
+ char test_str[] = R"("hello""world")";
+ size_t len = strlen(test_str);
+ escape_string_for_csv(test_str, len, '\\', '"');
+ EXPECT_EQ(std::string(test_str, len), R"("hello"world")");
+ }
+ {
+ char test_str[] = R"(\\"hello\\""world\\)";
+ size_t len = strlen(test_str);
+ escape_string_for_csv(test_str, len, '\\', '"');
+ EXPECT_EQ(std::string(test_str, len), R"(\"hello\"world\)");
+ }
+ {
+ char test_str[] = "";
+ size_t len = strlen(test_str);
+ escape_string_for_csv(test_str, len, '\\', '"');
+ EXPECT_EQ(std::string(test_str, len), "");
+ }
+}
} // namespace doris::vectorized
\ No newline at end of file
diff --git
a/be/test/vec/exec/format/file_reader/new_plain_text_line_reader_test.cpp
b/be/test/vec/exec/format/file_reader/new_plain_text_line_reader_test.cpp
new file mode 100644
index 00000000000..2cc2796048e
--- /dev/null
+++ b/be/test/vec/exec/format/file_reader/new_plain_text_line_reader_test.cpp
@@ -0,0 +1,168 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+// be/test/vec/exec/format/file_reader/new_plain_text_line_reader_test.cpp
+
+#include "vec/exec/format/file_reader/new_plain_text_line_reader.h"
+
+#include <gtest/gtest.h>
+
+namespace doris::vectorized {
+
+// Base test class for text line reader tests
+class PlainTextLineReaderTest : public testing::Test {
+protected:
+ // Helper function to verify line splitting results
+ void verify_split_result(const std::string& input, const std::string&
line_delim, bool keep_cr,
+ const std::vector<std::string>& expected_lines) {
+ PlainTextLineReaderCtx ctx(line_delim, line_delim.size(), keep_cr);
+ const auto* data = reinterpret_cast<const uint8_t*>(input.c_str());
+ size_t pos = 0;
+ size_t size = input.size();
+ std::vector<std::string> actual_lines;
+
+ while (pos < size) {
+ ctx.refresh();
+ const auto* line_end = ctx.read_line(data + pos, size - pos);
+ if (!line_end) {
+ actual_lines.emplace_back(reinterpret_cast<const char*>(data +
pos), size - pos);
+ break;
+ }
+ size_t line_len = line_end - (data + pos);
+ actual_lines.emplace_back(reinterpret_cast<const char*>(data +
pos), line_len);
+ pos += line_len + ctx.line_delimiter_length();
+ }
+
+ ASSERT_EQ(expected_lines, actual_lines);
+ }
+};
+
+// Test cases for PlainTextLineReaderCtx
+TEST_F(PlainTextLineReaderTest, PlainTextBasic) {
+ verify_split_result("line1\nline2\nline3", "\n", false, {"line1", "line2",
"line3"});
+
+ verify_split_result("line1\r\nline2\r\nline3", "\r\n", false, {"line1",
"line2", "line3"});
+
+ verify_split_result("line1\r\nline2\r\nline3", "\n", true, {"line1\r",
"line2\r", "line3"});
+
+ verify_split_result("line1\n\nline3", "\n", false, {"line1", "", "line3"});
+
+ verify_split_result("line1||line2||line3", "||", false, {"line1", "line2",
"line3"});
+}
+
+// Test class for CSV line reader with enclosure support
+class EncloseCsvLineReaderTest : public testing::Test {
+protected:
+ // Helper function to verify CSV splitting results including column
positions
+ void verify_csv_split(const std::string& input, const std::string&
line_delim,
+ const std::string& col_sep, char enclose, char
escape, bool keep_cr,
+ const std::vector<std::string>& expected_lines,
+ const std::vector<std::vector<size_t>>&
expected_col_positions) {
+ EncloseCsvLineReaderContext ctx(line_delim, line_delim.size(),
col_sep, col_sep.size(), 10,
+ enclose, escape, keep_cr);
+
+ const auto* data = reinterpret_cast<const uint8_t*>(input.c_str());
+ size_t pos = 0;
+ size_t size = input.size();
+ std::vector<std::string> actual_lines;
+ std::vector<std::vector<size_t>> actual_col_positions;
+
+ while (pos < size) {
+ ctx.refresh();
+ const uint8_t* line_end = ctx.read_line(data + pos, size - pos);
+ if (!line_end) {
+ actual_lines.emplace_back(reinterpret_cast<const char*>(data +
pos), size - pos);
+ actual_col_positions.push_back(ctx.column_sep_positions());
+ break;
+ }
+ size_t line_len = line_end - (data + pos);
+ actual_lines.emplace_back(reinterpret_cast<const char*>(data +
pos), line_len);
+ actual_col_positions.push_back(ctx.column_sep_positions());
+ pos += line_len + ctx.line_delimiter_length();
+ }
+
+ ASSERT_EQ(expected_lines, actual_lines);
+ ASSERT_EQ(expected_col_positions, actual_col_positions);
+ }
+};
+
+// Basic CSV format test cases
+TEST_F(EncloseCsvLineReaderTest, CsvBasic) {
+ verify_csv_split("a,b,c\nd,e,f", "\n", ",", '"', '\\', false, {"a,b,c",
"d,e,f"},
+ {{1, 3}, {1, 3}});
+
+ verify_csv_split("\"a,x\",b,c\n\"d,y\",e,f", "\n", ",", '"', '\\', false,
+ {"\"a,x\",b,c", "\"d,y\",e,f"}, {{5, 7}, {5, 7}});
+
+ verify_csv_split("\"a\"\"x\",b,c\n\"d\\\"y\",e,f", "\n", ",", '"', '\\',
false,
+ {R"("a""x",b,c)", R"("d\"y",e,f)"}, {{6, 8}, {6, 8}});
+
+ verify_csv_split("a||b||c\nd||e||f", "\n", "||", '"', '\\', false,
{"a||b||c", "d||e||f"},
+ {{1, 4}, {1, 4}});
+}
+
+// Edge cases and corner scenarios
+TEST_F(EncloseCsvLineReaderTest, EdgeCases) {
+ verify_csv_split("\n\na,b,c", "\n", ",", '"', '\\', false, {"", "",
"a,b,c"}, {{}, {}, {1, 3}});
+
+ verify_csv_split("\"abc,def\nghi,jkl", "\n", ",", '"', '\\', false,
{"\"abc,def\nghi,jkl"},
+ {{}});
+
+ verify_csv_split("a,b\r\nc,d\ne,f", "\r\n", ",", '"', '\\', false, {"a,b",
"c,d\ne,f"},
+ {{1}, {1, 5}});
+
+ verify_csv_split(R"(\,\"\n,b,c)", "\n", ",", '"', '\\', false,
{R"(\,\"\n,b,c)"}, {{1, 6, 8}});
+}
+
+TEST_F(EncloseCsvLineReaderTest, QuoteEscaping) {
+ // Test multiple quoted fields with double-quote escaping in one line
+ verify_csv_split(R"("hello ""world\n""","foo ""bar""","test ""quote""
here")", "\n", ",", '"',
+ '\\', false, {R"("hello ""world\n""","foo ""bar""","test
""quote"" here")"},
+ {{19, 33}});
+
+ // Test JSON-like string with escaped quotes
+ verify_csv_split(
+ R"({""code"": ""100"", ""message"": ""query success"", ""data"":
{""status"": ""1""}})",
+ "\n", ",", '"', '\\', false,
+ {R"({""code"": ""100"", ""message"": ""query success"", ""data"":
{""status"": ""1""}})"},
+ {{18, 50}});
+
+ // Test custom enclose character
+ verify_csv_split(R"({|code|: |100|, |message|: |query success|, |data|:
{|status|: |1|}})",
+ "\n", ",", '|', '\\', false,
+ {R"({|code|: |100|, |message|: |query success|, |data|:
{|status|: |1|}})"},
+ {{14, 42}});
+}
+
+TEST_F(EncloseCsvLineReaderTest, MultiCharDelimiters) {
+ // Test multi-character line delimiter
+ verify_csv_split("a,b,c\r\n\nd,e,f", "\r\n\n", ",", '"', '\\', false,
{"a,b,c", "d,e,f"},
+ {{1, 3}, {1, 3}});
+
+ // Test multi-character column delimiter
+ verify_csv_split("a|||b|||c\nd|||e|||f", "\n", "|||", '"', '\\', false,
+ {"a|||b|||c", "d|||e|||f"}, {{1, 5}, {1, 5}});
+
+ // Test both multi-character line and column delimiters
+ verify_csv_split("a|||b|||c\r\n\nd|||e|||f", "\r\n\n", "|||", '"', '\\',
false,
+ {"a|||b|||c", "d|||e|||f"}, {{1, 5}, {1, 5}});
+
+ verify_csv_split("\"a|||b\"|||c\r\n\n\"d|||e\"|||f", "\r\n\n", "|||", '"',
'\\', false,
+ {"\"a|||b\"|||c", "\"d|||e\"|||f"}, {{7}, {7}});
+}
+
+} // namespace doris::vectorized
diff --git
a/docker/thirdparties/docker-compose/hive/scripts/create_preinstalled_scripts/run76.hql
b/docker/thirdparties/docker-compose/hive/scripts/create_preinstalled_scripts/run76.hql
index c003c9e7d50..fe11a46067c 100755
---
a/docker/thirdparties/docker-compose/hive/scripts/create_preinstalled_scripts/run76.hql
+++
b/docker/thirdparties/docker-compose/hive/scripts/create_preinstalled_scripts/run76.hql
@@ -1,3 +1,4 @@
+create database if not exists multi_catalog;
use multi_catalog;
CREATE TABLE text_table_normal_skip_header (
@@ -20,6 +21,15 @@ STORED AS TEXTFILE
LOCATION '/user/doris/preinstalled_data/text/text_table_compressed_skip_header'
TBLPROPERTIES ("skip.header.line.count"="5");
+CREATE TABLE csv_json_table_simple (
+ id STRING,
+ status_json STRING
+)
+ROW FORMAT SERDE
+ 'org.apache.hadoop.hive.serde2.OpenCSVSerde'
+STORED AS TEXTFILE
+LOCATION '/user/doris/preinstalled_data/csv/csv_json_table_simple';
+
create database if not exists openx_json;
use openx_json;
diff --git
a/docker/thirdparties/docker-compose/hive/scripts/preinstalled_data/csv/csv_json_table_simple/csv_json_table_simple.csv
b/docker/thirdparties/docker-compose/hive/scripts/preinstalled_data/csv/csv_json_table_simple/csv_json_table_simple.csv
new file mode 100644
index 00000000000..7e17354e161
--- /dev/null
+++
b/docker/thirdparties/docker-compose/hive/scripts/preinstalled_data/csv/csv_json_table_simple/csv_json_table_simple.csv
@@ -0,0 +1,10 @@
+"1001","{""code"": ""100"", ""message"": ""query success"", ""data"":
{""status"": ""1""}}"
+"1002","{""code"": ""100"", ""message"": ""query success"", ""data"":
{""status"": ""20""}}"
+"1003","{""code"": ""100"", ""message"": ""query success"", ""data"":
{""status"": ""1""}}"
+"1004","{""code"": ""200"", ""message"": ""query failed"", ""data"": {}}"
+"1005","{""code"": ""100"", ""message"": ""query success"", ""data"":
{""status"": ""20""}}"
+"1006","{""code"": ""100"", ""message"": ""query success"", ""data"":
{""status"": ""1""}}"
+"1007","{""code"": ""300"", ""message"": ""invalid request"", ""data"": null}"
+"1008","{""code"": ""100"", ""message"": ""query success"", ""data"":
{""status"": ""0""}}"
+"1009","{""code"": ""100"", ""message"": ""query success"", ""data"":
{""status"": ""1""}}"
+"1010","{""code"": ""100"", ""message"": ""query success"", ""data"":
{""status"": ""20""}}"
diff --git
a/regression-test/data/external_table_p0/hive/test_hive_serde_prop.out
b/regression-test/data/external_table_p0/hive/test_hive_serde_prop.out
index fd8e7bb4313..c2415c058f1 100644
Binary files
a/regression-test/data/external_table_p0/hive/test_hive_serde_prop.out and
b/regression-test/data/external_table_p0/hive/test_hive_serde_prop.out differ
diff --git
a/regression-test/data/external_table_p0/hive/test_open_csv_serde.out
b/regression-test/data/external_table_p0/hive/test_open_csv_serde.out
new file mode 100644
index 00000000000..a071f731f45
Binary files /dev/null and
b/regression-test/data/external_table_p0/hive/test_open_csv_serde.out differ
diff --git
a/regression-test/data/load_p0/broker_load/test_csv_with_enclose_and_escapeS3_load.out
b/regression-test/data/load_p0/broker_load/test_csv_with_enclose_and_escapeS3_load.out
index 0c1450e35fe..8d4444ac418 100644
Binary files
a/regression-test/data/load_p0/broker_load/test_csv_with_enclose_and_escapeS3_load.out
and
b/regression-test/data/load_p0/broker_load/test_csv_with_enclose_and_escapeS3_load.out
differ
diff --git
a/regression-test/data/load_p0/stream_load/enclose_with_escape_doublequote.csv
b/regression-test/data/load_p0/stream_load/enclose_with_escape_doublequote.csv
new file mode 100644
index 00000000000..41a0d65cf26
--- /dev/null
+++
b/regression-test/data/load_p0/stream_load/enclose_with_escape_doublequote.csv
@@ -0,0 +1 @@
+5,"abc""de,fg""h","2023-07-17","""abc""def""","2023-07-22:07:00:00","test
""escape"" in enclose"
diff --git
a/regression-test/data/load_p0/stream_load/enclose_with_escape_quote.csv
b/regression-test/data/load_p0/stream_load/enclose_with_escape_quote.csv
new file mode 100644
index 00000000000..0ade6f86900
--- /dev/null
+++ b/regression-test/data/load_p0/stream_load/enclose_with_escape_quote.csv
@@ -0,0 +1 @@
+4,"abc\"de,fg\"h","2023-07-16","\"abc\"def\"","2023-07-21:06:00:00","test
\"escape\" in enclose"
diff --git
a/regression-test/data/load_p0/stream_load/test_csv_with_enclose_and_escape.out
b/regression-test/data/load_p0/stream_load/test_csv_with_enclose_and_escape.out
index 5646d96230f..d4509165cf4 100644
Binary files
a/regression-test/data/load_p0/stream_load/test_csv_with_enclose_and_escape.out
and
b/regression-test/data/load_p0/stream_load/test_csv_with_enclose_and_escape.out
differ
diff --git
a/regression-test/suites/external_table_p0/hive/test_open_csv_serde.groovy
b/regression-test/suites/external_table_p0/hive/test_open_csv_serde.groovy
new file mode 100644
index 00000000000..c3b5794323d
--- /dev/null
+++ b/regression-test/suites/external_table_p0/hive/test_open_csv_serde.groovy
@@ -0,0 +1,45 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+
+suite("test_open_csv_serde","p0,external,tvf,hive,external_docker,external_docker_hive")
{
+ String enabled = context.config.otherConfigs.get("enableHiveTest")
+ if (enabled == null || !enabled.equalsIgnoreCase("true")) {
+ logger.info("diable Hive test.")
+ return;
+ }
+
+ for (String hivePrefix : ["hive2","hive3"]) {
+
+ String hms_port = context.config.otherConfigs.get(hivePrefix +
"HmsPort")
+ String catalog_name = "${hivePrefix}_test_open_csv_serde"
+ String externalEnvIp = context.config.otherConfigs.get("externalEnvIp")
+ def hdfsUserName = "doris"
+ String hdfs_port = context.config.otherConfigs.get(hivePrefix +
"HdfsPort")
+ def defaultFS = "hdfs://${externalEnvIp}:${hdfs_port}"
+
+ sql """drop catalog if exists ${catalog_name}"""
+ sql """create catalog if not exists ${catalog_name} properties (
+ "type"="hms",
+ 'hive.metastore.uris' = 'thrift://${externalEnvIp}:${hms_port}'
+ );"""
+ sql """use `${catalog_name}`.`multi_catalog`"""
+
+ qt_csv_escape_quote_in_enclose """select * from csv_json_table_simple
order by id;"""
+ // TODO: add more case after refactor csv_reader and text_reader
+ }
+}
\ No newline at end of file
diff --git
a/regression-test/suites/load_p0/stream_load/test_csv_with_enclose_and_escape.groovy
b/regression-test/suites/load_p0/stream_load/test_csv_with_enclose_and_escape.groovy
index 1562fa35cfd..5625a7e6de6 100644
---
a/regression-test/suites/load_p0/stream_load/test_csv_with_enclose_and_escape.groovy
+++
b/regression-test/suites/load_p0/stream_load/test_csv_with_enclose_and_escape.groovy
@@ -1,4 +1,3 @@
-
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
@@ -38,6 +37,8 @@ suite("test_csv_with_enclose_and_escape", "p0") {
def normalCases = [
'enclose_normal.csv',
'enclose_with_escape.csv',
+ 'enclose_with_escape_quote.csv',
+ 'enclose_with_escape_doublequote.csv',
'enclose_wrong_position.csv',
'enclose_empty_values.csv'
]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]