This is an automated email from the ASF dual-hosted git repository.
morrysnow pushed a commit to branch branch-3.1
in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/branch-3.1 by this push:
new f764788d8bb branch-3.1: [fix](csv reader) fix csv parse error when use
enclose with multi-char column separator #54581 (#54764)
f764788d8bb is described below
commit f764788d8bb364bc6b5bcdb567f47ff3a504d78b
Author: github-actions[bot]
<41898282+github-actions[bot]@users.noreply.github.com>
AuthorDate: Fri Aug 15 17:40:42 2025 +0800
branch-3.1: [fix](csv reader) fix csv parse error when use enclose with
multi-char column separator #54581 (#54764)
Cherry-picked from #54581
Co-authored-by: hui lai <[email protected]>
---
.../file_reader/new_plain_text_line_reader.cpp | 7 ++-
.../test_csv_big_file_truncate_delimiter.csv.gz | Bin 0 -> 850496 bytes
...stream_load_big_file_with_special_delimiter.out | Bin 113 -> 132 bytes
...eam_load_big_file_with_special_delimiter.groovy | 54 +++++++++++++++++++++
4 files changed, 60 insertions(+), 1 deletion(-)
diff --git a/be/src/vec/exec/format/file_reader/new_plain_text_line_reader.cpp
b/be/src/vec/exec/format/file_reader/new_plain_text_line_reader.cpp
index a068a748b11..80534fb9471 100644
--- a/be/src/vec/exec/format/file_reader/new_plain_text_line_reader.cpp
+++ b/be/src/vec/exec/format/file_reader/new_plain_text_line_reader.cpp
@@ -47,6 +47,12 @@
namespace doris {
#include "common/compile_check_begin.h"
const uint8_t* EncloseCsvLineReaderCtx::read_line_impl(const uint8_t* start,
const size_t length) {
+ // Avoid the case where part of the bytes of a multi-char column separator
+ // have already been parsed, which would cause a column-separator parsing
+ // error.
+ if (_state.curr_state == ReaderState::NORMAL ||
+ _state.curr_state == ReaderState::MATCH_ENCLOSE) {
+ _idx -= std::min(_column_sep_len - 1, _idx);
+ }
_total_len = length;
size_t bound = update_reading_bound(start);
@@ -138,7 +144,6 @@ void EncloseCsvLineReaderCtx::_on_normal(const uint8_t*
start, size_t& len) {
_state.forward_to(ReaderState::START);
return;
}
- // TODO(tsy): maybe potential bug when a multi-char is not read completely
_idx = len;
}
diff --git
a/regression-test/data/load_p1/stream_load/test_csv_big_file_truncate_delimiter.csv.gz
b/regression-test/data/load_p1/stream_load/test_csv_big_file_truncate_delimiter.csv.gz
new file mode 100644
index 00000000000..fd79d197413
Binary files /dev/null and
b/regression-test/data/load_p1/stream_load/test_csv_big_file_truncate_delimiter.csv.gz
differ
diff --git
a/regression-test/data/load_p1/stream_load/test_stream_load_big_file_with_special_delimiter.out
b/regression-test/data/load_p1/stream_load/test_stream_load_big_file_with_special_delimiter.out
index 3404ee00ebf..18ad24a7652 100644
Binary files
a/regression-test/data/load_p1/stream_load/test_stream_load_big_file_with_special_delimiter.out
and
b/regression-test/data/load_p1/stream_load/test_stream_load_big_file_with_special_delimiter.out
differ
diff --git
a/regression-test/suites/load_p1/stream_load/test_stream_load_big_file_with_special_delimiter.groovy
b/regression-test/suites/load_p1/stream_load/test_stream_load_big_file_with_special_delimiter.groovy
index 509f2c98f60..75bbd332b76 100644
---
a/regression-test/suites/load_p1/stream_load/test_stream_load_big_file_with_special_delimiter.groovy
+++
b/regression-test/suites/load_p1/stream_load/test_stream_load_big_file_with_special_delimiter.groovy
@@ -44,4 +44,58 @@ suite("test_stream_load_big_file_with_special_delimiter",
"p1") {
sql "sync"
qt_sql "select count(*) from ${tableName}"
+
+ tableName = "test_csv_big_file_truncate_delimiter";
+ sql """ DROP TABLE IF EXISTS ${tableName} """
+ sql """
+ CREATE TABLE ${tableName} (
+ `measureid` VARCHAR(500) NOT NULL,
+ `measuretag` VARCHAR(500) NOT NULL,
+ `timestamp` VARCHAR(500) NOT NULL,
+ `ds` VARCHAR(255) NULL,
+ `hh` VARCHAR(255) NULL,
+ `meter_id` VARCHAR(500) NULL,
+ `maintenance_team` VARCHAR(1000) NULL,
+ `psr_class_name` VARCHAR(500) NULL,
+ `inst_id` VARCHAR(500) NULL,
+ `location_type` VARCHAR(500) NULL,
+ `name` VARCHAR(500) NULL,
+ `depart` VARCHAR(500) NULL,
+ `measurepoint_id` VARCHAR(500) NULL,
+ `district` VARCHAR(500) NULL,
+ `enddevice_psr_class_name` VARCHAR(500) NULL,
+ `enddevice_psr_id` VARCHAR(500) NULL,
+ `root_id` VARCHAR(500) NULL,
+ `rt` VARCHAR(500) NULL,
+ `measurevalue` VARCHAR(500) NULL,
+ `dataquality` VARCHAR(500) NULL,
+ `datatablename` VARCHAR(500) NULL,
+ `tag` VARCHAR(500) NULL,
+ `equip_src_id` VARCHAR(500) NULL,
+ `root_class_name` VARCHAR(500) NULL,
+ `ssid` VARCHAR(500) NULL,
+ `sysdate_uep` VARCHAR(500) NULL
+ ) ENGINE=OLAP
+ DUPLICATE KEY(`measureid`, `measuretag`, `timestamp`, `ds`)
+ AUTO PARTITION BY LIST (`ds`)(
+ )
+ DISTRIBUTED BY HASH(`measureid`) BUCKETS 10
+ PROPERTIES (
+ "replication_allocation" = "tag.location.default: 1"
+ );
+ """
+ streamLoad {
+ table "${tableName}"
+
+ set 'column_separator', '@@@'
+ set 'columns',
'hh,ds,meter_id,maintenance_team,measureid,psr_class_name,inst_id,location_type,name,depart,measurepoint_id,district,enddevice_psr_class_name,enddevice_psr_id,root_id,measuretag,rt,measurevalue,timestamp,dataquality,datatablename,tag,equip_src_id,root_class_name,ssid,sysdate_uep'
+ set 'enclose', '`'
+ set 'format', "CSV"
+ set 'compress_type', 'GZ'
+
+ file 'test_csv_big_file_truncate_delimiter.csv.gz'
+ }
+
+ sql "sync"
+ qt_sql "select count(*) from ${tableName}"
}
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]