This is an automated email from the ASF dual-hosted git repository.
dataroaring pushed a commit to branch branch-3.0
in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/branch-3.0 by this push:
new df7194e4962 branch-3.0: [fix](csv reader) fix csv parse error when using enclose with multi-char column separator (#54581) (#55052)
df7194e4962 is described below
commit df7194e496217c64df7c61892204ce77d8c83667
Author: hui lai <[email protected]>
AuthorDate: Fri Aug 22 21:45:20 2025 +0800
branch-3.0: [fix](csv reader) fix csv parse error when using enclose with multi-char column separator (#54581) (#55052)
pick #54581
_idx records how far into the buffer parsing has progressed. If the buffer
does not yet contain a complete row, _idx reaches the length of the buffer
and the buffer is then expanded with more data. If some bytes of a
multi-char column separator sit at the end of the buffer while the
remaining bytes have not been read yet, parsing after the expansion resumes
past those already-consumed bytes, so the complete column separator can
never be matched, resulting in parse errors.
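To make the failure mode concrete, here is a minimal self-contained sketch
of the boundary problem and the rewind fix; find_sep is a hypothetical
stand-in for the reader's separator scan, not the actual parser code:

    #include <algorithm>
    #include <iostream>
    #include <string>

    // Hypothetical stand-in for the reader's separator scan: find the next
    // complete separator at or after idx, or npos if none is present.
    static size_t find_sep(const std::string& buf, size_t idx,
                           const std::string& sep) {
        return buf.find(sep, idx);
    }

    int main() {
        const std::string sep = "@@@"; // multi-char column separator
        std::string buf = "col1@@";    // first read ends mid-separator
        size_t idx = buf.size();       // whole buffer consumed: idx == 6

        buf += "@col2\n";              // buffer expanded; last byte arrives

        // Without the fix: scanning resumes at idx == 6, sees only
        // "@col2\n", and the separator straddling the boundary is missed.
        std::cout << (find_sep(buf, idx, sep) == std::string::npos)
                  << '\n';             // prints 1 (separator missed)

        // The fix: rewind idx by (separator length - 1) before resuming,
        // so a separator split across the boundary is matched in full.
        idx -= std::min(sep.size() - 1, idx);
        std::cout << find_sep(buf, idx, sep) << '\n'; // prints 4 (matched)
    }

The patch below applies exactly this rewind, but only while the reader is
in the NORMAL or MATCH_ENCLOSE state.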
---
.../file_reader/new_plain_text_line_reader.cpp | 7 ++-
.../test_csv_big_file_truncate_delimiter.csv.gz | Bin 0 -> 850496 bytes
...stream_load_big_file_with_special_delimiter.out | Bin 113 -> 132 bytes
...eam_load_big_file_with_special_delimiter.groovy | 54 +++++++++++++++++++++
4 files changed, 60 insertions(+), 1 deletion(-)
diff --git a/be/src/vec/exec/format/file_reader/new_plain_text_line_reader.cpp b/be/src/vec/exec/format/file_reader/new_plain_text_line_reader.cpp
index 94eaf2edadb..08000d7df5b 100644
--- a/be/src/vec/exec/format/file_reader/new_plain_text_line_reader.cpp
+++ b/be/src/vec/exec/format/file_reader/new_plain_text_line_reader.cpp
@@ -49,6 +49,12 @@
 namespace doris {
 
 const uint8_t* EncloseCsvLineReaderContext::read_line_impl(const uint8_t* start,
                                                            const size_t length) {
+    // If some bytes of a multi-char column separator were already consumed at
+    // the end of the previous buffer, rewind so it can be matched in full.
+    if (_state.curr_state == ReaderState::NORMAL ||
+        _state.curr_state == ReaderState::MATCH_ENCLOSE) {
+        _idx -= std::min(_column_sep_len - 1, _idx);
+    }
     _total_len = length;
     size_t bound = update_reading_bound(start);
@@ -141,7 +147,6 @@ void EncloseCsvLineReaderContext::_on_normal(const uint8_t* start, size_t& len)
         _state.forward_to(ReaderState::START);
         return;
     }
-    // TODO(tsy): maybe potential bug when a multi-char is not read completely
     _idx = len;
 }
diff --git a/regression-test/data/load_p1/stream_load/test_csv_big_file_truncate_delimiter.csv.gz b/regression-test/data/load_p1/stream_load/test_csv_big_file_truncate_delimiter.csv.gz
new file mode 100644
index 00000000000..fd79d197413
Binary files /dev/null and b/regression-test/data/load_p1/stream_load/test_csv_big_file_truncate_delimiter.csv.gz differ
diff --git a/regression-test/data/load_p1/stream_load/test_stream_load_big_file_with_special_delimiter.out b/regression-test/data/load_p1/stream_load/test_stream_load_big_file_with_special_delimiter.out
index 3404ee00ebf..18ad24a7652 100644
Binary files a/regression-test/data/load_p1/stream_load/test_stream_load_big_file_with_special_delimiter.out and b/regression-test/data/load_p1/stream_load/test_stream_load_big_file_with_special_delimiter.out differ
diff --git a/regression-test/suites/load_p1/stream_load/test_stream_load_big_file_with_special_delimiter.groovy b/regression-test/suites/load_p1/stream_load/test_stream_load_big_file_with_special_delimiter.groovy
index 509f2c98f60..75bbd332b76 100644
--- a/regression-test/suites/load_p1/stream_load/test_stream_load_big_file_with_special_delimiter.groovy
+++ b/regression-test/suites/load_p1/stream_load/test_stream_load_big_file_with_special_delimiter.groovy
@@ -44,4 +44,58 @@ suite("test_stream_load_big_file_with_special_delimiter", "p1") {
 
     sql "sync"
     qt_sql "select count(*) from ${tableName}"
+
+    tableName = "test_csv_big_file_truncate_delimiter";
+    sql """ DROP TABLE IF EXISTS ${tableName} """
+    sql """
+        CREATE TABLE ${tableName} (
+            `measureid` VARCHAR(500) NOT NULL,
+            `measuretag` VARCHAR(500) NOT NULL,
+            `timestamp` VARCHAR(500) NOT NULL,
+            `ds` VARCHAR(255) NULL,
+            `hh` VARCHAR(255) NULL,
+            `meter_id` VARCHAR(500) NULL,
+            `maintenance_team` VARCHAR(1000) NULL,
+            `psr_class_name` VARCHAR(500) NULL,
+            `inst_id` VARCHAR(500) NULL,
+            `location_type` VARCHAR(500) NULL,
+            `name` VARCHAR(500) NULL,
+            `depart` VARCHAR(500) NULL,
+            `measurepoint_id` VARCHAR(500) NULL,
+            `district` VARCHAR(500) NULL,
+            `enddevice_psr_class_name` VARCHAR(500) NULL,
+            `enddevice_psr_id` VARCHAR(500) NULL,
+            `root_id` VARCHAR(500) NULL,
+            `rt` VARCHAR(500) NULL,
+            `measurevalue` VARCHAR(500) NULL,
+            `dataquality` VARCHAR(500) NULL,
+            `datatablename` VARCHAR(500) NULL,
+            `tag` VARCHAR(500) NULL,
+            `equip_src_id` VARCHAR(500) NULL,
+            `root_class_name` VARCHAR(500) NULL,
+            `ssid` VARCHAR(500) NULL,
+            `sysdate_uep` VARCHAR(500) NULL
+        ) ENGINE=OLAP
+        DUPLICATE KEY(`measureid`, `measuretag`, `timestamp`, `ds`)
+        AUTO PARTITION BY LIST (`ds`)(
+        )
+        DISTRIBUTED BY HASH(`measureid`) BUCKETS 10
+        PROPERTIES (
+            "replication_allocation" = "tag.location.default: 1"
+        );
+    """
+    streamLoad {
+        table "${tableName}"
+
+        set 'column_separator', '@@@'
+        set 'columns', 'hh,ds,meter_id,maintenance_team,measureid,psr_class_name,inst_id,location_type,name,depart,measurepoint_id,district,enddevice_psr_class_name,enddevice_psr_id,root_id,measuretag,rt,measurevalue,timestamp,dataquality,datatablename,tag,equip_src_id,root_class_name,ssid,sysdate_uep'
+        set 'enclose', '`'
+        set 'format', "CSV"
+        set 'compress_type', 'GZ'
+
+        file 'test_csv_big_file_truncate_delimiter.csv.gz'
+    }
+
+    sql "sync"
+    qt_sql "select count(*) from ${tableName}"
 }
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]