This is an automated email from the ASF dual-hosted git repository. morningman pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/master by this push: new 2dda44d7b5 [fix](csv-reader)fix bug of multi-char delimiter in csv reader 2dda44d7b5 is described below commit 2dda44d7b53a825f864ab9ec0743e18a113d3f68 Author: daidai <2017501...@qq.com> AuthorDate: Wed Aug 23 15:19:13 2023 +0800 [fix](csv-reader)fix bug of multi-char delimiter in csv reader fix bug that csv_reader parse line in order to get column. --- be/src/vec/exec/format/csv/csv_reader.cpp | 69 ++++++----- .../load_p0/stream_load/test_csv_split_line.out | 14 +++ .../load_p0/stream_load/test_csv_split_line1.csv | 1 + .../load_p0/stream_load/test_csv_split_line2.csv | 4 + .../load_p0/stream_load/test_csv_split_line3.csv | 4 + .../load_p0/stream_load/test_csv_split_line.groovy | 130 +++++++++++++++++++++ 6 files changed, 194 insertions(+), 28 deletions(-) diff --git a/be/src/vec/exec/format/csv/csv_reader.cpp b/be/src/vec/exec/format/csv/csv_reader.cpp index aed7864dd8..ba5d69cb73 100644 --- a/be/src/vec/exec/format/csv/csv_reader.cpp +++ b/be/src/vec/exec/format/csv/csv_reader.cpp @@ -100,38 +100,51 @@ void PlainCsvTextFieldSplitter::_split_field_single_char(const Slice& line, void PlainCsvTextFieldSplitter::_split_field_multi_char(const Slice& line, std::vector<Slice>* splitted_values) { - const char* data = line.data; size_t start = 0; // point to the start pos of next col value. size_t curpos = 0; // point to the start pos of separator matching sequence. - size_t p1 = 0; // point to the current pos of separator matching sequence. - - // Separator: AAAA - // - // p1 - // ▼ - // AAAA - // 1000AAAA2000AAAA - // ▲ ▲ - // Start │ - // curpos - while (curpos < line.size) { - if (curpos + p1 == line.size || *(data + curpos + p1) != _value_sep[p1]) { - // Not match, move forward: - curpos += (p1 == 0 ? 1 : p1); - p1 = 0; - } else { - p1++; - if (p1 == value_sep_len) { - // Match a separator - process_value_func(data, start, curpos - start, trimming_char, splitted_values); - start = curpos + value_sep_len; - curpos = start; - p1 = 0; - } + + // value_sep : AAAA + // line.data : 1234AAAA5678 + // -> 1234,5678 + + // start start + // ▼ ▼ + // 1234AAAA5678\0 + // ▲ ▲ + // curpos curpos + + //kmp + vector<int> next(value_sep_len); + next[0] = -1; + for (int i = 1, j = -1; i < value_sep_len; i++) { + while (j > -1 && _value_sep[i] != _value_sep[j + 1]) { + j = next[j]; + } + if (_value_sep[i] == _value_sep[j + 1]) { + j++; + } + next[i] = j; + } + + for (int i = 0, j = -1; i < line.size; i++) { + // i : line + // j : _value_sep + while (j > -1 && line[i] != _value_sep[j + 1]) { + j = next[j]; + } + if (line[i] == _value_sep[j + 1]) { + j++; + } + if (j == value_sep_len - 1) { + curpos = i - value_sep_len + 1; + + process_value_func(line.data, start, curpos - start, trimming_char, splitted_values); + + start = i + 1; + j = next[j]; } } - CHECK(curpos == line.size) << curpos << " vs " << line.size; - process_value_func(data, start, curpos - start, trimming_char, splitted_values); + process_value_func(line.data, start, line.size - start, trimming_char, splitted_values); } void PlainCsvTextFieldSplitter::do_split(const Slice& line, std::vector<Slice>* splitted_values) { diff --git a/regression-test/data/load_p0/stream_load/test_csv_split_line.out b/regression-test/data/load_p0/stream_load/test_csv_split_line.out new file mode 100644 index 0000000000..fe62ba6f12 --- /dev/null +++ b/regression-test/data/load_p0/stream_load/test_csv_split_line.out @@ -0,0 +1,14 @@ +-- This file is automatically generated. You should know what you did if you want to edit this +-- !sql -- +000e124abc3a49b18b14424ebb6ee8b5 2715668347726333217 352b88835f0a761888314515e4de5b18 000e124abc3a49b18b14424ebb6ee8b5 1682897543355 2023-05-01 hips_product hips_combo 829aafbe9b59ae408b3fbf21d8d8fb797c7f2358 \N c:\\windows\\system32\\tasks\\lenovo\\imcontroller\\timebasedevents\\a4612416-67a7-48f1-9f87-e1e6dd7dd87e a4612416-67a7-48f1-9f87-e1e6dd7dd87e \N 0 0 \N 0 0 \N 1 10.0.19044.256.1.0 11.00.19041.1566 (WinBuild.160101.0800) fdid:563 Lenovo Lenovo.Modern.ImController Lenovo.Modern.Im [...] + +-- !sql -- +1000 worldhell 10000000 ello +2000 wohellhell 200000 ellohell +3000 worellohell 30000000 elloab +4000 hellwohellhell \N abcdeeelhllo +10001 helloword 114466 0000011445\r +55555 \N 14455 7711445777\r +66666 \N \N 113355\r +77777 0011455 8888 114545 + diff --git a/regression-test/data/load_p0/stream_load/test_csv_split_line1.csv b/regression-test/data/load_p0/stream_load/test_csv_split_line1.csv new file mode 100644 index 0000000000..7e0a6f8144 --- /dev/null +++ b/regression-test/data/load_p0/stream_load/test_csv_split_line1.csv @@ -0,0 +1 @@ +000e124abc3a49b18b14424ebb6ee8b55b18511e27156683477263332175b18511e352b88835f0a761888314515e4de5b185b18511e000e124abc3a49b18b14424ebb6ee8b55b18511e16828975433555b18511e2023-05-015b18511ehips_product5b18511ehips_combo5b18511e829aafbe9b59ae408b3fbf21d8d8fb797c7f23585b18511e\N5b18511ec:\windows\system32\tasks\lenovo\imcontroller\timebasedevents\a4612416-67a7-48f1-9f87-e1e6dd7dd87e5b18511ea4612416-67a7-48f1-9f87-e1e6dd7dd87e5b18511e\N5b18511e05b18511e05b18511e\N5b18511e05b18511e05b18511e\N5b [...] \ No newline at end of file diff --git a/regression-test/data/load_p0/stream_load/test_csv_split_line2.csv b/regression-test/data/load_p0/stream_load/test_csv_split_line2.csv new file mode 100644 index 0000000000..04ba509ae4 --- /dev/null +++ b/regression-test/data/load_p0/stream_load/test_csv_split_line2.csv @@ -0,0 +1,4 @@ +1000helloworldhellhello10000000helloello +2000hellowohellhellhello200000helloellohell +3000helloworellohellhello30000000helloelloab +4000hellohellwohellhellhello\Nhelloabcdeeelhllo \ No newline at end of file diff --git a/regression-test/data/load_p0/stream_load/test_csv_split_line3.csv b/regression-test/data/load_p0/stream_load/test_csv_split_line3.csv new file mode 100644 index 0000000000..bb6949bacf --- /dev/null +++ b/regression-test/data/load_p0/stream_load/test_csv_split_line3.csv @@ -0,0 +1,4 @@ +10001114455helloword1144551144661144550000011445 +55555114455\N114455144551144557711445777 +66666114455\N114455\N114455113355 +7777711445500114551144558888114455114545 \ No newline at end of file diff --git a/regression-test/suites/load_p0/stream_load/test_csv_split_line.groovy b/regression-test/suites/load_p0/stream_load/test_csv_split_line.groovy new file mode 100644 index 0000000000..b22e8bb319 --- /dev/null +++ b/regression-test/suites/load_p0/stream_load/test_csv_split_line.groovy @@ -0,0 +1,130 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +suite("test_csv_split_line", "p0") { + def tableName = "test_csv_split_line" + sql """ set enable_fallback_to_original_planner=false;""" + sql """ create database if not exists demo;""" + sql """ DROP TABLE IF EXISTS ${tableName}1 """ + sql """ CREATE TABLE ${tableName}1 ( + `mid` varchar(255) NULL, + `ent_id` varchar(255) NULL, + `file_md5` varchar(255) NULL, + `m2` varchar(255) NULL, + `event_time` bigint(20) NULL, + `event_date` date NULL, + `product` varchar(255) NULL, + `combo` varchar(255) NULL, + `file_sha1` varchar(255) NULL, + `file_sha256` varchar(255) NULL, + `file_path` varchar(1000) NULL, + `file_name` varchar(1000) NULL, + `file_size` int(11) NULL, + `file_age` int(11) NULL, + `file_ispe` varchar(1000) NULL, + `file_isx64` int(11) NULL, + `file_level` int(11) NULL, + `file_sublevel` int(11) NULL, + `file_level_sublevel` varchar(255) NULL, + `client_iswin64` int(11) NULL, + `client_os_version` varchar(255) NULL, + `client_ie_version` varchar(255) NULL, + `rule_group_id` varchar(1000) NULL, + `process_sign` varchar(1000) NULL, + `process_product_name` varchar(1000) NULL, + `process_original_name` varchar(1000) NULL, + `process_internal_name` varchar(1000) NULL, + `process_pparent_path` varchar(10000) NULL, + `process_parent_path` varchar(10000) NULL, + `process_parent_command_line` varchar(60000) NULL, + `process_path` varchar(10000) NULL, + `process_command_line` varchar(10000) NULL, + `file_dna` varchar(1000) NULL, + `icon_dna` varchar(1000) NULL, + `client_ip` varchar(10000) NULL, + `assetid` varchar(255) NULL, + `product_ver` varchar(255) NULL, + `clientid` varchar(1000) NULL, + `process_file_size` int(11) NULL, + `client_id` varchar(65533) NULL, + `rule_hit_all` varchar(65533) NULL, + `__op` boolean NULL + ) ENGINE=OLAP + DUPLICATE KEY(`mid`) + DISTRIBUTED BY HASH(`mid`) BUCKETS 10 + PROPERTIES ( + "replication_allocation" = "tag.location.default: 1" + ); + """ + + streamLoad { + table "${tableName}1" + + set 'column_separator', '5b18511e' + set 'columns', """ mid,ent_id,file_md5,m2,event_time,event_date,product,combo, + file_sha1,file_sha256,file_path,file_name,file_size,file_age,file_ispe, + file_isx64,file_level,file_sublevel,file_level_sublevel,client_iswin64, + client_os_version,client_ie_version,rule_group_id,process_sign, + process_product_name,process_original_name,process_internal_name, + process_pparent_path,process_parent_path,process_parent_command_line, + process_path,process_command_line,file_dna,icon_dna,client_ip,assetid, + product_ver,clientid,process_file_size,client_id,rule_hit_all """ + + file 'test_csv_split_line1.csv' + } + + sql """sync""" + + qt_sql """select * from ${tableName}1;""" + sql """ drop table ${tableName}1; """ + + + sql """ DROP TABLE IF EXISTS ${tableName}2 """ + sql """ create table ${tableName}2 ( + a int , + b varchar(30), + c int , + d varchar(30), + ) + DUPLICATE KEY(`a`) + DISTRIBUTED BY HASH(`a`) BUCKETS 10 + PROPERTIES ( + "replication_allocation" = "tag.location.default: 1" + ); + """ + streamLoad { + table "${tableName}2" + set 'column_separator', 'hello' + file 'test_csv_split_line2.csv' + } + streamLoad { + table "${tableName}2" + set 'column_separator', '114455' + file 'test_csv_split_line3.csv' + } + + sql "sync" + qt_sql """select * from ${tableName}2 order by a;""" + + + + + + + sql """ drop table ${tableName}2; """ + +} --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org For additional commands, e-mail: commits-h...@doris.apache.org