This is an automated email from the ASF dual-hosted git repository. morningman pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/master by this push: new c704497d02 [fix](csv_reader)Fixed bug when parsing multi-character delimiters. (#24572) c704497d02 is described below commit c704497d021016d4c8e6087287b90c1dc646f8be Author: daidai <2017501...@qq.com> AuthorDate: Wed Sep 20 12:41:35 2023 +0800 [fix](csv_reader)Fixed bug when parsing multi-character delimiters. (#24572) Fixed bug when parsing multi-character delimiters. --- be/src/vec/exec/format/csv/csv_reader.cpp | 22 ++++++++++-- .../load_p0/stream_load/test_csv_split_line.out | 42 ++++++++++++++++++++++ .../load_p0/stream_load/test_csv_split_line2.csv | 5 ++- .../load_p0/stream_load/test_csv_split_line3.csv | 3 ++ .../load_p0/stream_load/test_csv_split_line4.csv | 16 +++++++++ .../load_p0/stream_load/test_csv_split_line.groovy | 41 +++++++++++++++++++++ 6 files changed, 126 insertions(+), 3 deletions(-) diff --git a/be/src/vec/exec/format/csv/csv_reader.cpp b/be/src/vec/exec/format/csv/csv_reader.cpp index eeb3aac416..93769a97c9 100644 --- a/be/src/vec/exec/format/csv/csv_reader.cpp +++ b/be/src/vec/exec/format/csv/csv_reader.cpp @@ -139,9 +139,27 @@ void PlainCsvTextFieldSplitter::_split_field_multi_char(const Slice& line, if (j == value_sep_len - 1) { curpos = i - value_sep_len + 1; - process_value_func(line.data, start, curpos - start, trimming_char, splitted_values); + /* + * column_separator : "xx" + * data.csv : data1xxxxdata2 + * + * Parse incorrectly: + * data1[xx]xxdata2 + * data1x[xx]xdata2 + * data1xx[xx]data2 + * The string "xxxx" is parsed into three "xx" delimiters. + * + * Parse correctly: + * data1[xx]xxdata2 + * data1xx[xx]data2 + */ + + if (curpos >= start) { + process_value_func(line.data, start, curpos - start, trimming_char, + splitted_values); + start = i + 1; + } - start = i + 1; j = next[j]; } } diff --git a/regression-test/data/load_p0/stream_load/test_csv_split_line.out b/regression-test/data/load_p0/stream_load/test_csv_split_line.out index 7a97bc9314..0b16a8f480 100644 --- a/regression-test/data/load_p0/stream_load/test_csv_split_line.out +++ b/regression-test/data/load_p0/stream_load/test_csv_split_line.out @@ -4,11 +4,53 @@ -- !sql -- 1000 worldhell 10000000 ello +1111 22131 123123 0000000 2000 wohellhell 200000 ellohell +2222 \N \N \N 3000 worellohell 30000000 elloab 4000 hellwohellhell \N abcdeeelhllo +7777 \N 10001 helloword 114466 0000011445 +33333 00 11111 00000 +44444 00 11111 55555 \N 14455 7711445777 66666 \N \N 113355 77777 0011455 8888 114545 +99999 \N + +-- !sql -- +1 USER 13456 430,431,6418,419,31,341,420,421,7,428,429 0 2023-09-13T09:55:32 +10 \N 1 \N +11 \N \N 2023-09-13T09:57:32 +12 abc 21 1 \N +13 \N 22 1 \N +14 \N \N \N \N \N +15 112 \N 1231 \N \N +16 1 \N 1231 \N \N +2 USER 642836 68,260,257,334,30,218,308,309,31,75 0 2023-09-13T09:57:32 +3 CLASS 366 0 2023-09-13T09:57:32 +4 CLASS 10207 0 2023-09-13T09:57:32 +5 CLASS 111 \N \N +6 USER 1 11 \N \N +7 USER 1 11 \N 2023-09-13T09:57:32 +8 \N \N \N +9 \N 1 \N \N + +-- !sql -- +10 \N 1 \N +11 \N \N 2023-09-13T09:57:32 +12 abc 21 1 \N +3 CLASS 366 0 2023-09-13T09:57:32 +4 CLASS 10207 0 2023-09-13T09:57:32 +5 CLASS 111 \N \N +8 \N \N \N + +-- !sql -- +10 \N 1 \N +11 \N \N 2023-09-13T09:57:32 +12 abc 21 1 \N +3 CLASS 366 0 2023-09-13T09:57:32 +4 CLASS 10207 0 2023-09-13T09:57:32 +5 CLASS 111 \N \N +8 \N \N \N diff --git a/regression-test/data/load_p0/stream_load/test_csv_split_line2.csv b/regression-test/data/load_p0/stream_load/test_csv_split_line2.csv index 04ba509ae4..94340cebd1 100644 --- a/regression-test/data/load_p0/stream_load/test_csv_split_line2.csv +++ b/regression-test/data/load_p0/stream_load/test_csv_split_line2.csv @@ -1,4 +1,7 @@ 1000helloworldhellhello10000000helloello 2000hellowohellhellhello200000helloellohell 3000helloworellohellhello30000000helloelloab -4000hellohellwohellhellhello\Nhelloabcdeeelhllo \ No newline at end of file +4000hellohellwohellhellhello\Nhelloabcdeeelhllo +"1111"hello"22131"hello"123123"hello0000000 +2222hello\Nhello\Nhello\N +7777hellohellohello \ No newline at end of file diff --git a/regression-test/data/load_p0/stream_load/test_csv_split_line3.csv b/regression-test/data/load_p0/stream_load/test_csv_split_line3.csv index 4332f6b90e..f2bb26a8fb 100644 --- a/regression-test/data/load_p0/stream_load/test_csv_split_line3.csv +++ b/regression-test/data/load_p0/stream_load/test_csv_split_line3.csv @@ -2,3 +2,6 @@ 55555114455\N114455144551144557711445777 66666114455\N114455\N114455113355 7777711445500114551144558888114455114545 +99999114455114455114455 +33333114455001144551111111445500000 +444441144550011445511111114455 \ No newline at end of file diff --git a/regression-test/data/load_p0/stream_load/test_csv_split_line4.csv b/regression-test/data/load_p0/stream_load/test_csv_split_line4.csv new file mode 100644 index 0000000000..8956ed41be --- /dev/null +++ b/regression-test/data/load_p0/stream_load/test_csv_split_line4.csv @@ -0,0 +1,16 @@ +1||USER||13456||430,431,6418,419,31,341,420,421,7,428,429||0||2023-09-13 09:55:32 +2||USER||642836||68,260,257,334,30,218,308,309,31,75||0||2023-09-13 09:57:32 +3||CLASS||366||||0||2023-09-13 09:57:32 +4||CLASS||10207||||0||2023-09-13 09:57:32 +5||CLASS||111|||||| +6||USER||1||11|||| +7||USER||1||11||||2023-09-13 09:57:32 +8|||||||||| +9||||||1|||| +10||||||||1|| +11||||||||||2023-09-13 09:57:32 +12||abc||21||||1|| +13||||||22||1|| +14||\N||\N||\N||\N||\N +15||112||||1231|||| +16||1||||1231|||| \ No newline at end of file diff --git a/regression-test/suites/load_p0/stream_load/test_csv_split_line.groovy b/regression-test/suites/load_p0/stream_load/test_csv_split_line.groovy index c3b786bfc7..47bd8c3bbc 100644 --- a/regression-test/suites/load_p0/stream_load/test_csv_split_line.groovy +++ b/regression-test/suites/load_p0/stream_load/test_csv_split_line.groovy @@ -108,6 +108,7 @@ suite("test_csv_split_line", "p0") { streamLoad { table "${tableName}2" set 'column_separator', 'hello' + set 'trim_double_quotes', 'true' file 'test_csv_split_line2.csv' } streamLoad { @@ -124,4 +125,44 @@ suite("test_csv_split_line", "p0") { sql """ drop table ${tableName}2; """ + sql """ DROP TABLE IF EXISTS ${tableName}3 """ + sql """ create table ${tableName}3 ( + `user_id` bigint(20) NULL, + `tag_type` varchar(20) NULL , + `tag_owner_id` bigint(20) NULL, + `tag_value` text NULL , + `deleted` tinyint(4) NULL , + `create_time` datetime NULL DEFAULT CURRENT_TIMESTAMP + ) ENGINE=OLAP + UNIQUE KEY(`user_id`, `tag_type`, `tag_owner_id`) + DISTRIBUTED BY HASH(`user_id`) BUCKETS 20 + PROPERTIES ( + "replication_allocation" = "tag.location.default: 1", + "is_being_synced" = "false", + "colocate_with" = "__global__crm_user_group", + "storage_format" = "V2", + "enable_unique_key_merge_on_write" = "true", + "disable_auto_compaction" = "false", + "enable_single_replica_compaction" = "false" + ); + """ + + streamLoad { + table "${tableName}3" + set 'column_separator', '||' + file 'test_csv_split_line4.csv' + } + order_qt_sql """ + select * from ${tableName}3 order by user_id; + """ + + order_qt_sql """ + select * from ${tableName}3 where tag_value="" order by user_id; + """ + order_qt_sql """ + select * from ${tableName}3 where tag_value="" order by user_id; + """ + + sql """ drop table ${tableName}3; """ + } --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org For additional commands, e-mail: commits-h...@doris.apache.org