This is an automated email from the ASF dual-hosted git repository.
liaoxin pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/master by this push:
new 016fd06046d [fix](csv reader) fix incorrect column parsing when using
enclose for CSV files with UTF-8 BOM (#60864)
016fd06046d is described below
commit 016fd06046df13c24a906e626e9b389abfa7c8a3
Author: hui lai <[email protected]>
AuthorDate: Tue Mar 3 11:26:50 2026 +0800
[fix](csv reader) fix incorrect column parsing when using enclose for CSV
files with UTF-8 BOM (#60864)
## Background
When reading a CSV file that starts with a UTF-8 BOM (Byte Order Mark) while
the `enclose` character is enabled (e.g., `enclose = '"'`), the column names
and data values are parsed incorrectly.
## Root Cause
In enclose mode, `EncloseCsvLineReaderCtx` pre-computes
`column_sep_positions` (absolute byte offsets of column separators, counted
from the start of the line) during `read_line()`. These positions are
calculated on the raw line data **including the 3-byte BOM**
(`0xEF 0xBB 0xBF`).
Later, `CsvReader::_remove_bom()` shifts the data pointer forward by 3
bytes, but the
pre-computed `column_sep_positions` are not adjusted accordingly. When
`EncloseCsvTextFieldSplitter::do_split()` uses these stale positions on
the shifted pointer,
all field boundaries are off by 3 bytes, resulting in corrupted column
names and data.
This bug does **not** affect the non-enclose mode, because
`PlainCsvTextFieldSplitter`
scans the data on-the-fly rather than relying on pre-computed positions.
## Fix
- Add `adjust_column_sep_positions(size_t offset)` to
  `EncloseCsvLineReaderCtx` to subtract the given offset from all
  pre-computed separator positions.
- Store the `EncloseCsvLineReaderCtx` in `CsvReader` as a shared pointer
  (`_enclose_reader_ctx`) when enclose mode is active.
- Call the adjustment in `_remove_bom()` when a BOM is detected, so all
  call sites (`_parse_col_names`, `_parse_col_nums`, `get_next_block`) are
  automatically fixed.
---
be/src/vec/exec/format/csv/csv_reader.cpp | 18 ++++++++++++------
be/src/vec/exec/format/csv/csv_reader.h | 2 ++
.../format/file_reader/new_plain_text_line_reader.h | 9 +++++++++
.../data/load_p0/stream_load/enclose_with_bom.csv | 2 ++
.../stream_load/test_csv_with_enclose_and_escape.out | 2 ++
.../test_csv_with_enclose_and_escape.groovy | 10 ++++++++++
6 files changed, 37 insertions(+), 6 deletions(-)
diff --git a/be/src/vec/exec/format/csv/csv_reader.cpp
b/be/src/vec/exec/format/csv/csv_reader.cpp
index 4711f62a203..57cc2a66dda 100644
--- a/be/src/vec/exec/format/csv/csv_reader.cpp
+++ b/be/src/vec/exec/format/csv/csv_reader.cpp
@@ -593,14 +593,13 @@ Status CsvReader::_create_line_reader() {
} else {
// in load task, the _file_slot_descs is empty vector, so we need to
set col_sep_num to 0
size_t col_sep_num = _file_slot_descs.size() > 1 ?
_file_slot_descs.size() - 1 : 0;
- text_line_reader_ctx = std::make_shared<EncloseCsvLineReaderCtx>(
+ _enclose_reader_ctx = std::make_shared<EncloseCsvLineReaderCtx>(
_line_delimiter, _line_delimiter_length, _value_separator,
_value_separator_length,
col_sep_num, _enclose, _escape, _keep_cr);
+ text_line_reader_ctx = _enclose_reader_ctx;
_fields_splitter = std::make_unique<EncloseCsvTextFieldSplitter>(
- _trim_tailing_spaces, true,
-
std::static_pointer_cast<EncloseCsvLineReaderCtx>(text_line_reader_ctx),
- _value_separator_length, _enclose);
+ _trim_tailing_spaces, true, _enclose_reader_ctx,
_value_separator_length, _enclose);
}
switch (_file_format_type) {
case TFileFormatType::FORMAT_CSV_PLAIN:
@@ -820,8 +819,15 @@ Status CsvReader::_parse_col_types(size_t col_nums,
std::vector<DataTypePtr>* co
const uint8_t* CsvReader::_remove_bom(const uint8_t* ptr, size_t& size) {
if (size >= 3 && ptr[0] == 0xEF && ptr[1] == 0xBB && ptr[2] == 0xBF) {
LOG(INFO) << "remove bom";
- size -= 3;
- return ptr + 3;
+ constexpr size_t bom_size = 3;
+ size -= bom_size;
+ // In enclose mode, column_sep_positions were computed on the original
line
+ // (including BOM). After shifting the pointer, we must adjust those
positions
+ // so they remain correct relative to the new start.
+ if (_enclose_reader_ctx) {
+ _enclose_reader_ctx->adjust_column_sep_positions(bom_size);
+ }
+ return ptr + bom_size;
}
return ptr;
}
diff --git a/be/src/vec/exec/format/csv/csv_reader.h
b/be/src/vec/exec/format/csv/csv_reader.h
index e452b8a7af2..e5600314319 100644
--- a/be/src/vec/exec/format/csv/csv_reader.h
+++ b/be/src/vec/exec/format/csv/csv_reader.h
@@ -280,6 +280,8 @@ private:
io::IOContext* _io_ctx = nullptr;
std::shared_ptr<io::IOContext> _io_ctx_holder;
+ // Stored to adjust column_sep_positions when BOM is removed in enclose
mode
+ std::shared_ptr<EncloseCsvLineReaderCtx> _enclose_reader_ctx;
// save source text which have been splitted.
std::vector<Slice> _split_values;
std::vector<int> _use_nullable_string_opt;
diff --git a/be/src/vec/exec/format/file_reader/new_plain_text_line_reader.h
b/be/src/vec/exec/format/file_reader/new_plain_text_line_reader.h
index 0d02003eee1..abe3c704278 100644
--- a/be/src/vec/exec/format/file_reader/new_plain_text_line_reader.h
+++ b/be/src/vec/exec/format/file_reader/new_plain_text_line_reader.h
@@ -184,6 +184,15 @@ public:
return _column_sep_positions;
}
+ // Adjust column separator positions by subtracting the given offset.
+ // Used when BOM bytes are removed from the beginning of a line,
+ // shifting all positions by the BOM size.
+ void adjust_column_sep_positions(size_t offset) {
+ for (auto& pos : _column_sep_positions) {
+ pos -= offset;
+ }
+ }
+
const uint8_t* read_line_impl(const uint8_t* start, size_t length);
private:
diff --git a/regression-test/data/load_p0/stream_load/enclose_with_bom.csv
b/regression-test/data/load_p0/stream_load/enclose_with_bom.csv
new file mode 100644
index 00000000000..44aff1e8ac7
--- /dev/null
+++ b/regression-test/data/load_p0/stream_load/enclose_with_bom.csv
@@ -0,0 +1,2 @@
+"14","bom_test,data",2023-08-01,"hello,world","2023-08-01
12:00:00","bom,value"
+"15","normal",2023-08-02,"test","2023-08-02 13:00:00","data"
diff --git
a/regression-test/data/load_p0/stream_load/test_csv_with_enclose_and_escape.out
b/regression-test/data/load_p0/stream_load/test_csv_with_enclose_and_escape.out
index 7bc642d024a..46f59f7ffec 100644
---
a/regression-test/data/load_p0/stream_load/test_csv_with_enclose_and_escape.out
+++
b/regression-test/data/load_p0/stream_load/test_csv_with_enclose_and_escape.out
@@ -15,4 +15,6 @@
11 abc,def 2023-07-15 ghi 2023-07-20T05:48:31 jkl\nmne
12 {"a": 1} 2023-07-15 def 2023-07-20T05:48:31 {"a": 1}
13 {"a": 2} 2023-07-15 def 2023-07-20T05:48:31 {"a": 2}
+14 bom_test,data 2023-08-01 hello,world 2023-08-01T12:00
bom,value
+15 normal 2023-08-02 test 2023-08-02T13:00 data
diff --git
a/regression-test/suites/load_p0/stream_load/test_csv_with_enclose_and_escape.groovy
b/regression-test/suites/load_p0/stream_load/test_csv_with_enclose_and_escape.groovy
index e8f5e08ec20..309b78827a4 100644
---
a/regression-test/suites/load_p0/stream_load/test_csv_with_enclose_and_escape.groovy
+++
b/regression-test/suites/load_p0/stream_load/test_csv_with_enclose_and_escape.groovy
@@ -115,6 +115,16 @@ suite("test_csv_with_enclose_and_escape", "p0") {
file "enclose_with_same_escape.csv"
}
+ // test CSV file with UTF-8 BOM and enclose
+ streamLoad {
+ table "${tableName}"
+ set 'column_separator', ','
+ set 'enclose', "\""
+ set 'escape', '\\'
+
+ file "enclose_with_bom.csv"
+ }
+
sql "sync"
qt_select """
SELECT * FROM ${tableName} ORDER BY k1, k2
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]