This is an automated email from the ASF dual-hosted git repository.

liaoxin pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/doris.git


The following commit(s) were added to refs/heads/master by this push:
     new 016fd06046d [fix](csv reader) fix incorrect column parsing when using 
enclose for CSV files with UTF-8 BOM (#60864)
016fd06046d is described below

commit 016fd06046df13c24a906e626e9b389abfa7c8a3
Author: hui lai <[email protected]>
AuthorDate: Tue Mar 3 11:26:50 2026 +0800

    [fix](csv reader) fix incorrect column parsing when using enclose for CSV 
files with UTF-8 BOM (#60864)
    
    ## Background
    
    When reading a CSV file that starts with a UTF-8 BOM (Byte Order Mark) while
    an `enclose` character is configured (e.g., `enclose = '"'`), the column
    names and data values are parsed incorrectly.
    
    ## Root Cause
    
    In enclose mode, `EncloseCsvLineReaderCtx` pre-computes
    `column_sep_positions` (byte offsets of the column separators, measured
    from the start of the line) during `read_line()`. These positions are
    calculated on the raw line data **including the 3-byte BOM**
    (`0xEF 0xBB 0xBF`).
    
    Later, `CsvReader::_remove_bom()` advances the data pointer by 3 bytes, but
    the pre-computed `column_sep_positions` are not adjusted accordingly. When
    `EncloseCsvTextFieldSplitter::do_split()` applies these stale positions to
    the shifted pointer, every field boundary is off by 3 bytes, which corrupts
    column names and data.
    
    This bug does **not** affect the non-enclose mode, because
    `PlainCsvTextFieldSplitter`
    scans the data on-the-fly rather than relying on pre-computed positions.
    
    ## Fix
    
    - Add `adjust_column_sep_positions(size_t offset)` to
      `EncloseCsvLineReaderCtx` to subtract the given offset from all
      pre-computed separator positions.
    - Keep the `EncloseCsvLineReaderCtx` in a `CsvReader` member
      (`_enclose_reader_ctx`, a `std::shared_ptr`) that is set when enclose
      mode is active.
    - Call the adjustment in `_remove_bom()` when a BOM is detected, so all
      of its call sites (`_parse_col_names`, `_parse_col_nums`,
      `get_next_block`) are fixed automatically.
---
 be/src/vec/exec/format/csv/csv_reader.cpp              | 18 ++++++++++++------
 be/src/vec/exec/format/csv/csv_reader.h                |  2 ++
 .../format/file_reader/new_plain_text_line_reader.h    |  9 +++++++++
 .../data/load_p0/stream_load/enclose_with_bom.csv      |  2 ++
 .../stream_load/test_csv_with_enclose_and_escape.out   |  2 ++
 .../test_csv_with_enclose_and_escape.groovy            | 10 ++++++++++
 6 files changed, 37 insertions(+), 6 deletions(-)

diff --git a/be/src/vec/exec/format/csv/csv_reader.cpp 
b/be/src/vec/exec/format/csv/csv_reader.cpp
index 4711f62a203..57cc2a66dda 100644
--- a/be/src/vec/exec/format/csv/csv_reader.cpp
+++ b/be/src/vec/exec/format/csv/csv_reader.cpp
@@ -593,14 +593,13 @@ Status CsvReader::_create_line_reader() {
     } else {
         // in load task, the _file_slot_descs is empty vector, so we need to 
set col_sep_num to 0
         size_t col_sep_num = _file_slot_descs.size() > 1 ? 
_file_slot_descs.size() - 1 : 0;
-        text_line_reader_ctx = std::make_shared<EncloseCsvLineReaderCtx>(
+        _enclose_reader_ctx = std::make_shared<EncloseCsvLineReaderCtx>(
                 _line_delimiter, _line_delimiter_length, _value_separator, 
_value_separator_length,
                 col_sep_num, _enclose, _escape, _keep_cr);
+        text_line_reader_ctx = _enclose_reader_ctx;
 
         _fields_splitter = std::make_unique<EncloseCsvTextFieldSplitter>(
-                _trim_tailing_spaces, true,
-                
std::static_pointer_cast<EncloseCsvLineReaderCtx>(text_line_reader_ctx),
-                _value_separator_length, _enclose);
+                _trim_tailing_spaces, true, _enclose_reader_ctx, 
_value_separator_length, _enclose);
     }
     switch (_file_format_type) {
     case TFileFormatType::FORMAT_CSV_PLAIN:
@@ -820,8 +819,15 @@ Status CsvReader::_parse_col_types(size_t col_nums, 
std::vector<DataTypePtr>* co
 const uint8_t* CsvReader::_remove_bom(const uint8_t* ptr, size_t& size) {
     if (size >= 3 && ptr[0] == 0xEF && ptr[1] == 0xBB && ptr[2] == 0xBF) {
         LOG(INFO) << "remove bom";
-        size -= 3;
-        return ptr + 3;
+        constexpr size_t bom_size = 3;
+        size -= bom_size;
+        // In enclose mode, column_sep_positions were computed on the original line
+        // (including BOM). After shifting the pointer, we must adjust those positions
+        // so they remain correct relative to the new start.
+        if (_enclose_reader_ctx) {
+            _enclose_reader_ctx->adjust_column_sep_positions(bom_size);
+        }
+        return ptr + bom_size;
     }
     return ptr;
 }
diff --git a/be/src/vec/exec/format/csv/csv_reader.h 
b/be/src/vec/exec/format/csv/csv_reader.h
index e452b8a7af2..e5600314319 100644
--- a/be/src/vec/exec/format/csv/csv_reader.h
+++ b/be/src/vec/exec/format/csv/csv_reader.h
@@ -280,6 +280,8 @@ private:
 
     io::IOContext* _io_ctx = nullptr;
     std::shared_ptr<io::IOContext> _io_ctx_holder;
+    // Stored to adjust column_sep_positions when BOM is removed in enclose mode
+    std::shared_ptr<EncloseCsvLineReaderCtx> _enclose_reader_ctx;
     // save source text which have been splitted.
     std::vector<Slice> _split_values;
     std::vector<int> _use_nullable_string_opt;
diff --git a/be/src/vec/exec/format/file_reader/new_plain_text_line_reader.h 
b/be/src/vec/exec/format/file_reader/new_plain_text_line_reader.h
index 0d02003eee1..abe3c704278 100644
--- a/be/src/vec/exec/format/file_reader/new_plain_text_line_reader.h
+++ b/be/src/vec/exec/format/file_reader/new_plain_text_line_reader.h
@@ -184,6 +184,15 @@ public:
         return _column_sep_positions;
     }
 
+    // Shift every pre-computed column separator position left by `offset` bytes.
+    // Used when BOM bytes are stripped from the start of a line. A position smaller
+    // than `offset` is clamped to 0 so the unsigned subtraction cannot wrap around.
+    void adjust_column_sep_positions(size_t offset) {
+        for (auto& pos : _column_sep_positions) {
+            pos = pos >= offset ? pos - offset : 0;
+        }
+    }
+
     const uint8_t* read_line_impl(const uint8_t* start, size_t length);
 
 private:
diff --git a/regression-test/data/load_p0/stream_load/enclose_with_bom.csv 
b/regression-test/data/load_p0/stream_load/enclose_with_bom.csv
new file mode 100644
index 00000000000..44aff1e8ac7
--- /dev/null
+++ b/regression-test/data/load_p0/stream_load/enclose_with_bom.csv
@@ -0,0 +1,2 @@
+"14","bom_test,data",2023-08-01,"hello,world","2023-08-01 
12:00:00","bom,value"
+"15","normal",2023-08-02,"test","2023-08-02 13:00:00","data"
diff --git 
a/regression-test/data/load_p0/stream_load/test_csv_with_enclose_and_escape.out 
b/regression-test/data/load_p0/stream_load/test_csv_with_enclose_and_escape.out
index 7bc642d024a..46f59f7ffec 100644
--- 
a/regression-test/data/load_p0/stream_load/test_csv_with_enclose_and_escape.out
+++ 
b/regression-test/data/load_p0/stream_load/test_csv_with_enclose_and_escape.out
@@ -15,4 +15,6 @@
 11     abc,def 2023-07-15      ghi     2023-07-20T05:48:31     jkl\nmne
 12     {"a": 1}        2023-07-15      def     2023-07-20T05:48:31     {"a": 1}
 13     {"a": 2}        2023-07-15      def     2023-07-20T05:48:31     {"a": 2}
+14     bom_test,data   2023-08-01      hello,world     2023-08-01T12:00        
bom,value
+15     normal  2023-08-02      test    2023-08-02T13:00        data
 
diff --git 
a/regression-test/suites/load_p0/stream_load/test_csv_with_enclose_and_escape.groovy
 
b/regression-test/suites/load_p0/stream_load/test_csv_with_enclose_and_escape.groovy
index e8f5e08ec20..309b78827a4 100644
--- 
a/regression-test/suites/load_p0/stream_load/test_csv_with_enclose_and_escape.groovy
+++ 
b/regression-test/suites/load_p0/stream_load/test_csv_with_enclose_and_escape.groovy
@@ -115,6 +115,16 @@ suite("test_csv_with_enclose_and_escape", "p0") {
         file "enclose_with_same_escape.csv"
     }
 
+    // test CSV file with UTF-8 BOM and enclose
+    streamLoad {
+        table "${tableName}"
+        set 'column_separator', ','
+        set 'enclose', "\""
+        set 'escape', '\\'
+
+        file "enclose_with_bom.csv"
+    }
+
     sql "sync"
     qt_select """
         SELECT * FROM ${tableName} ORDER BY k1, k2 


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to