This is an automated email from the ASF dual-hosted git repository.
dataroaring pushed a commit to branch branch-3.0
in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/branch-3.0 by this push:
new df7194e4962 branch-3.0: [fix](csv reader) fix csv parse error when using enclose with multi-char column separator (#54581) (#55052)
df7194e4962 is described below
commit df7194e496217c64df7c61892204ce77d8c83667
Author: hui lai <[email protected]>
AuthorDate: Fri Aug 22 21:45:20 2025 +0800
branch-3.0: [fix](csv reader) fix csv parse error when using enclose with multi-char column separator (#54581) (#55052)
pick #54581
_idx records how far into the buffer parsing has progressed. If the buffer
does not yet contain a complete row, _idx reaches the length of the buffer
and the buffer is then expanded with more data. If some bytes of a
multi-char column separator sit at the end of the buffer while the
remaining bytes have not been read yet, parsing after the expansion resumes
past those already-consumed bytes, so the complete column separator can
never be matched, resulting in parse errors.
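To make the failure mode concrete, here is a minimal self-contained sketch
of the boundary problem and the rewind fix; find_sep is a hypothetical
stand-in for the reader's separator scan, not the actual parser code:

    #include <algorithm>
    #include <iostream>
    #include <string>

    // Hypothetical stand-in for the reader's separator scan: find the next
    // complete separator at or after idx, or npos if none is present.
    static size_t find_sep(const std::string& buf, size_t idx,
                           const std::string& sep) {
        return buf.find(sep, idx);
    }

    int main() {
        const std::string sep = "@@@"; // multi-char column separator
        std::string buf = "col1@@";    // first read ends mid-separator
        size_t idx = buf.size();       // whole buffer consumed: idx == 6

        buf += "@col2\n";              // buffer expanded; last byte arrives

        // Without the fix: scanning resumes at idx == 6, sees only
        // "@col2\n", and the separator straddling the boundary is missed.
        std::cout << (find_sep(buf, idx, sep) == std::string::npos)
                  << '\n';             // prints 1 (separator missed)

        // The fix: rewind idx by (separator length - 1) before resuming,
        // so a separator split across the boundary is matched in full.
        idx -= std::min(sep.size() - 1, idx);
        std::cout << find_sep(buf, idx, sep) << '\n'; // prints 4 (matched)
    }

The patch below applies exactly this rewind, but only while the reader is
in the NORMAL or MATCH_ENCLOSE state.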
---
.../file_reader/new_plain_text_line_reader.cpp | 7 ++-
.../test_csv_big_file_truncate_delimiter.csv.gz | Bin 0 -> 850496 bytes
...stream_load_big_file_with_special_delimiter.out | Bin 113 -> 132 bytes
...eam_load_big_file_with_special_delimiter.groovy | 54 +++++++++++++++++++++
4 files changed, 60 insertions(+), 1 deletion(-)
diff --git a/be/src/vec/exec/format/file_reader/new_plain_text_line_reader.cpp b/be/src/vec/exec/format/file_reader/new_plain_text_line_reader.cpp
index 94eaf2edadb..08000d7df5b 100644
--- a/be/src/vec/exec/format/file_reader/new_plain_text_line_reader.cpp
+++ b/be/src/vec/exec/format/file_reader/new_plain_text_line_reader.cpp
@@ -49,6 +49,12 @@
 namespace doris {
 
 const uint8_t* EncloseCsvLineReaderContext::read_line_impl(const uint8_t* start,
                                                            const size_t length) {
+    // If some bytes of a multi-char column separator were already consumed at
+    // the end of the previous buffer, rewind so it can be matched in full.
+    if (_state.curr_state == ReaderState::NORMAL ||
+        _state.curr_state == ReaderState::MATCH_ENCLOSE) {
+        _idx -= std::min(_column_sep_len - 1, _idx);
+    }
     _total_len = length;
     size_t bound = update_reading_bound(start);
@@ -141,7 +147,6 @@ void EncloseCsvLineReaderContext::_on_normal(const uint8_t* start, size_t& len)
         _state.forward_to(ReaderState::START);
         return;
     }
-    // TODO(tsy): maybe potential bug when a multi-char is not read completely
     _idx = len;
 }
diff --git a/regression-test/data/load_p1/stream_load/test_csv_big_file_truncate_delimiter.csv.gz b/regression-test/data/load_p1/stream_load/test_csv_big_file_truncate_delimiter.csv.gz
new file mode 100644
index 00000000000..fd79d197413
Binary files /dev/null and b/regression-test/data/load_p1/stream_load/test_csv_big_file_truncate_delimiter.csv.gz differ
diff --git a/regression-test/data/load_p1/stream_load/test_stream_load_big_file_with_special_delimiter.out b/regression-test/data/load_p1/stream_load/test_stream_load_big_file_with_special_delimiter.out
index 3404ee00ebf..18ad24a7652 100644
Binary files a/regression-test/data/load_p1/stream_load/test_stream_load_big_file_with_special_delimiter.out and b/regression-test/data/load_p1/stream_load/test_stream_load_big_file_with_special_delimiter.out differ
diff --git a/regression-test/suites/load_p1/stream_load/test_stream_load_big_file_with_special_delimiter.groovy b/regression-test/suites/load_p1/stream_load/test_stream_load_big_file_with_special_delimiter.groovy
index 509f2c98f60..75bbd332b76 100644
--- a/regression-test/suites/load_p1/stream_load/test_stream_load_big_file_with_special_delimiter.groovy
+++ b/regression-test/suites/load_p1/stream_load/test_stream_load_big_file_with_special_delimiter.groovy
@@ -44,4 +44,58 @@ suite("test_stream_load_big_file_with_special_delimiter", "p1") {
 
     sql "sync"
     qt_sql "select count(*) from ${tableName}"
+
+    tableName = "test_csv_big_file_truncate_delimiter";
+    sql """ DROP TABLE IF EXISTS ${tableName} """
+    sql """
+        CREATE TABLE ${tableName} (
+            `measureid` VARCHAR(500) NOT NULL,
+            `measuretag` VARCHAR(500) NOT NULL,
+            `timestamp` VARCHAR(500) NOT NULL,
+            `ds` VARCHAR(255) NULL,
+            `hh` VARCHAR(255) NULL,
+            `meter_id` VARCHAR(500) NULL,
+            `maintenance_team` VARCHAR(1000) NULL,
+            `psr_class_name` VARCHAR(500) NULL,
+            `inst_id` VARCHAR(500) NULL,
+            `location_type` VARCHAR(500) NULL,
+            `name` VARCHAR(500) NULL,
+            `depart` VARCHAR(500) NULL,
+            `measurepoint_id` VARCHAR(500) NULL,
+            `district` VARCHAR(500) NULL,
+            `enddevice_psr_class_name` VARCHAR(500) NULL,
+            `enddevice_psr_id` VARCHAR(500) NULL,
+            `root_id` VARCHAR(500) NULL,
+            `rt` VARCHAR(500) NULL,
+            `measurevalue` VARCHAR(500) NULL,
+            `dataquality` VARCHAR(500) NULL,
+            `datatablename` VARCHAR(500) NULL,
+            `tag` VARCHAR(500) NULL,
+            `equip_src_id` VARCHAR(500) NULL,
+            `root_class_name` VARCHAR(500) NULL,
+            `ssid` VARCHAR(500) NULL,
+            `sysdate_uep` VARCHAR(500) NULL
+        ) ENGINE=OLAP
+        DUPLICATE KEY(`measureid`, `measuretag`, `timestamp`, `ds`)
+        AUTO PARTITION BY LIST (`ds`)(
+        )
+        DISTRIBUTED BY HASH(`measureid`) BUCKETS 10
+        PROPERTIES (
+            "replication_allocation" = "tag.location.default: 1"
+        );
+    """
+    streamLoad {
+        table "${tableName}"
+
+        set 'column_separator', '@@@'
+        set 'columns', 'hh,ds,meter_id,maintenance_team,measureid,psr_class_name,inst_id,location_type,name,depart,measurepoint_id,district,enddevice_psr_class_name,enddevice_psr_id,root_id,measuretag,rt,measurevalue,timestamp,dataquality,datatablename,tag,equip_src_id,root_class_name,ssid,sysdate_uep'
+        set 'enclose', '`'
+        set 'format', "CSV"
+        set 'compress_type', 'GZ'
+
+        file 'test_csv_big_file_truncate_delimiter.csv.gz'
+    }
+
+    sql "sync"
+    qt_sql "select count(*) from ${tableName}"
 }
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]