This is an automated email from the ASF dual-hosted git repository.
morrysnow pushed a commit to branch branch-3.1
in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/branch-3.1 by this push:
new f764788d8bb branch-3.1: [fix](csv reader) fix csv parse error when use
enclose with multi-char column separator #54581 (#54764)
f764788d8bb is described below
commit f764788d8bb364bc6b5bcdb567f47ff3a504d78b
Author: github-actions[bot]
<41898282+github-actions[bot]@users.noreply.github.com>
AuthorDate: Fri Aug 15 17:40:42 2025 +0800
branch-3.1: [fix](csv reader) fix csv parse error when use enclose with
multi-char column separator #54581 (#54764)
Cherry-picked from #54581
Co-authored-by: hui lai <[email protected]>
---
.../file_reader/new_plain_text_line_reader.cpp | 7 ++-
.../test_csv_big_file_truncate_delimiter.csv.gz | Bin 0 -> 850496 bytes
...stream_load_big_file_with_special_delimiter.out | Bin 113 -> 132 bytes
...eam_load_big_file_with_special_delimiter.groovy | 54 +++++++++++++++++++++
4 files changed, 60 insertions(+), 1 deletion(-)
diff --git a/be/src/vec/exec/format/file_reader/new_plain_text_line_reader.cpp
b/be/src/vec/exec/format/file_reader/new_plain_text_line_reader.cpp
index a068a748b11..80534fb9471 100644
--- a/be/src/vec/exec/format/file_reader/new_plain_text_line_reader.cpp
+++ b/be/src/vec/exec/format/file_reader/new_plain_text_line_reader.cpp
@@ -47,6 +47,12 @@
namespace doris {
#include "common/compile_check_begin.h"
const uint8_t* EncloseCsvLineReaderCtx::read_line_impl(const uint8_t* start,
const size_t length) {
+ // Avoid the case where part of the bytes of a multi-char column separator
+ // have already been parsed, which would cause a column-separator parsing
+ // error.
+ if (_state.curr_state == ReaderState::NORMAL ||
+ _state.curr_state == ReaderState::MATCH_ENCLOSE) {
+ _idx -= std::min(_column_sep_len - 1, _idx);
+ }
_total_len = length;
size_t bound = update_reading_bound(start);
@@ -138,7 +144,6 @@ void EncloseCsvLineReaderCtx::_on_normal(const uint8_t*
start, size_t& len) {
_state.forward_to(ReaderState::START);
return;
}
- // TODO(tsy): maybe potential bug when a multi-char is not read completely
_idx = len;
}
diff --git
a/regression-test/data/load_p1/stream_load/test_csv_big_file_truncate_delimiter.csv.gz
b/regression-test/data/load_p1/stream_load/test_csv_big_file_truncate_delimiter.csv.gz
new file mode 100644
index 00000000000..fd79d197413
Binary files /dev/null and
b/regression-test/data/load_p1/stream_load/test_csv_big_file_truncate_delimiter.csv.gz
differ
diff --git
a/regression-test/data/load_p1/stream_load/test_stream_load_big_file_with_special_delimiter.out
b/regression-test/data/load_p1/stream_load/test_stream_load_big_file_with_special_delimiter.out
index 3404ee00ebf..18ad24a7652 100644
Binary files
a/regression-test/data/load_p1/stream_load/test_stream_load_big_file_with_special_delimiter.out
and
b/regression-test/data/load_p1/stream_load/test_stream_load_big_file_with_special_delimiter.out
differ
diff --git
a/regression-test/suites/load_p1/stream_load/test_stream_load_big_file_with_special_delimiter.groovy
b/regression-test/suites/load_p1/stream_load/test_stream_load_big_file_with_special_delimiter.groovy
index 509f2c98f60..75bbd332b76 100644
---
a/regression-test/suites/load_p1/stream_load/test_stream_load_big_file_with_special_delimiter.groovy
+++
b/regression-test/suites/load_p1/stream_load/test_stream_load_big_file_with_special_delimiter.groovy
@@ -44,4 +44,58 @@ suite("test_stream_load_big_file_with_special_delimiter",
"p1") {
sql "sync"
qt_sql "select count(*) from ${tableName}"
+
+ tableName = "test_csv_big_file_truncate_delimiter";
+ sql """ DROP TABLE IF EXISTS ${tableName} """
+ sql """
+ CREATE TABLE ${tableName} (
+ `measureid` VARCHAR(500) NOT NULL,
+ `measuretag` VARCHAR(500) NOT NULL,
+ `timestamp` VARCHAR(500) NOT NULL,
+ `ds` VARCHAR(255) NULL,
+ `hh` VARCHAR(255) NULL,
+ `meter_id` VARCHAR(500) NULL,
+ `maintenance_team` VARCHAR(1000) NULL,
+ `psr_class_name` VARCHAR(500) NULL,
+ `inst_id` VARCHAR(500) NULL,
+ `location_type` VARCHAR(500) NULL,
+ `name` VARCHAR(500) NULL,
+ `depart` VARCHAR(500) NULL,
+ `measurepoint_id` VARCHAR(500) NULL,
+ `district` VARCHAR(500) NULL,
+ `enddevice_psr_class_name` VARCHAR(500) NULL,
+ `enddevice_psr_id` VARCHAR(500) NULL,
+ `root_id` VARCHAR(500) NULL,
+ `rt` VARCHAR(500) NULL,
+ `measurevalue` VARCHAR(500) NULL,
+ `dataquality` VARCHAR(500) NULL,
+ `datatablename` VARCHAR(500) NULL,
+ `tag` VARCHAR(500) NULL,
+ `equip_src_id` VARCHAR(500) NULL,
+ `root_class_name` VARCHAR(500) NULL,
+ `ssid` VARCHAR(500) NULL,
+ `sysdate_uep` VARCHAR(500) NULL
+ ) ENGINE=OLAP
+ DUPLICATE KEY(`measureid`, `measuretag`, `timestamp`, `ds`)
+ AUTO PARTITION BY LIST (`ds`)(
+ )
+ DISTRIBUTED BY HASH(`measureid`) BUCKETS 10
+ PROPERTIES (
+ "replication_allocation" = "tag.location.default: 1"
+ );
+ """
+ streamLoad {
+ table "${tableName}"
+
+ set 'column_separator', '@@@'
+ set 'columns',
'hh,ds,meter_id,maintenance_team,measureid,psr_class_name,inst_id,location_type,name,depart,measurepoint_id,district,enddevice_psr_class_name,enddevice_psr_id,root_id,measuretag,rt,measurevalue,timestamp,dataquality,datatablename,tag,equip_src_id,root_class_name,ssid,sysdate_uep'
+ set 'enclose', '`'
+ set 'format', "CSV"
+ set 'compress_type', 'GZ'
+
+ file 'test_csv_big_file_truncate_delimiter.csv.gz'
+ }
+
+ sql "sync"
+ qt_sql "select count(*) from ${tableName}"
}
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]