This is an automated email from the ASF dual-hosted git repository. morningman pushed a commit to branch branch-1.2-unstable in repository https://gitbox.apache.org/repos/asf/doris.git
commit 6f022b40a2e33b23fec48d36abc66621daa1e2f1 Author: camby <104178...@qq.com> AuthorDate: Wed Nov 9 16:58:08 2022 +0800 [refractor](array) refractor DataTypeArray from_string (#13905) refractor DataTypeArray from_string, make it more clear; support ',' and ']' inside string element, for example: ['hello,,,', 'world][]'] support empty elements, such as [,] ==> [0,0] Co-authored-by: cambyzju <zhuxiaol...@baidu.com> --- be/src/exec/broker_scanner.cpp | 4 - be/src/vec/data_types/data_type_array.cpp | 159 ++++++++++++--------- .../data/load_p0/broker_load/simple_array.csv | 3 +- .../data/load_p0/broker_load/test_array_load.out | 4 + .../data/load_p0/stream_load/test_stream_load.out | 4 +- .../cast_function/test_cast_string_to_array.out | 9 +- 6 files changed, 108 insertions(+), 75 deletions(-) diff --git a/be/src/exec/broker_scanner.cpp b/be/src/exec/broker_scanner.cpp index 3d9d5cf0f2..d65b413927 100644 --- a/be/src/exec/broker_scanner.cpp +++ b/be/src/exec/broker_scanner.cpp @@ -495,10 +495,6 @@ Status BrokerScanner::_line_to_src_tuple(const Slice& line) { return Status::OK(); } - if (!check_array_format(_split_values)) { - return Status::OK(); - } - for (int i = 0; i < _split_values.size(); ++i) { auto slot_desc = _src_slot_descs[i]; const Slice& value = _split_values[i]; diff --git a/be/src/vec/data_types/data_type_array.cpp b/be/src/vec/data_types/data_type_array.cpp index 9f29980e60..6827cb3779 100644 --- a/be/src/vec/data_types/data_type_array.cpp +++ b/be/src/vec/data_types/data_type_array.cpp @@ -175,6 +175,69 @@ std::string DataTypeArray::to_string(const IColumn& column, size_t row_num) cons return ss.str(); } +bool next_element_from_string(ReadBuffer& rb, StringRef& output, bool& has_quota) { + StringRef element(rb.position(), 0); + has_quota = false; + if (rb.eof()) { + return false; + } + + // ltrim + while (!rb.eof() && isspace(*rb.position())) { + ++rb.position(); + element.data = rb.position(); + } + + // parse string + if (*rb.position() == '"' || *rb.position() == '\'') { + const char str_sep = *rb.position(); + size_t str_len = 1; + // search until next '"' or '\'' + while (str_len < rb.count() && *(rb.position() + str_len) != str_sep) { + ++str_len; + } + // invalid string + if (str_len >= rb.count()) { + rb.position() = rb.end(); + return false; + } + has_quota = true; + rb.position() += str_len + 1; + element.size += str_len + 1; + } + + // parse array element until array separator ',' or end ']' + while (!rb.eof() && (*rb.position() != ',') && (rb.count() != 1 || *rb.position() != ']')) { + // invalid elements such as ["123" 456,"789" 777] + // correct elements such as ["123" ,"789" ] + if (has_quota && !isspace(*rb.position())) { + return false; + } + ++rb.position(); + ++element.size; + } + // invalid array element + if (rb.eof()) { + return false; + } + // adjust read buffer position to first char of next array element + ++rb.position(); + + // rtrim + while (element.size > 0 && isspace(element.data[element.size - 1])) { + --element.size; + } + + // trim '"' and '\'' for string + if (element.size >= 2 && (element.data[0] == '"' || element.data[0] == '\'') && + element.data[0] == element.data[element.size - 1]) { + ++element.data; + element.size -= 2; + } + output = element; + return true; +} + Status DataTypeArray::from_string(ReadBuffer& rb, IColumn* column) const { DCHECK(!rb.eof()); // only support one level now @@ -191,85 +254,53 @@ Status DataTypeArray::from_string(ReadBuffer& rb, IColumn* column) const { return Status::InvalidArgument("Array does not end with ']' character, found '{}'", *(rb.end() - 1)); } + // empty array [] + if (rb.count() == 2) { + offsets.push_back(offsets.back()); + return Status::OK(); + } ++rb.position(); - bool first = true; - size_t size = 0; - while (!rb.eof() && *rb.position() != ']') { - if (!first) { - if (*rb.position() == ',') { - ++rb.position(); - } else { - return Status::InvalidArgument( - "Cannot read array from text, expected comma or end of array, found '{}'", - *rb.position()); - } - } - first = false; - if (*rb.position() == ']') { - break; - } - size_t nested_str_len = 0; - char* temp_char = rb.position() + nested_str_len; - while (*(temp_char) != ']' && *(temp_char) != ',' && temp_char != rb.end()) { - ++nested_str_len; - temp_char = rb.position() + nested_str_len; + + size_t element_num = 0; + // parse array element until end of array + while (!rb.eof()) { + StringRef element(rb.position(), rb.count()); + bool has_quota = false; + if (!next_element_from_string(rb, element, has_quota)) { + return Status::InvalidArgument("Cannot read array element from text '{}'", + element.to_string()); } - // dispose the case of [123,,,] - if (nested_str_len == 0) { + // handle empty element + if (element.size == 0) { auto& nested_null_col = reinterpret_cast<ColumnNullable&>(nested_column); nested_null_col.get_nested_column().insert_default(); nested_null_col.get_null_map_data().push_back(0); - ++size; + ++element_num; continue; } - // Note: here we will trim elements, such as - // ["2020-09-01", "2021-09-01" , "2022-09-01" ] ==> ["2020-09-01","2021-09-01","2022-09-01"] - size_t begin_pos = 0; - size_t end_pos = nested_str_len - 1; - while (begin_pos < end_pos) { - if (isspace(*(rb.position() + begin_pos))) { - ++begin_pos; - } else if (isspace(*(rb.position() + end_pos))) { - --end_pos; - } else { - break; - } - } - - // dispose the case of ["123"] or ['123'] - bool has_quota = false; - size_t tmp_len = nested_str_len; - ReadBuffer read_buffer(rb.position(), nested_str_len); - auto begin_char = *(rb.position() + begin_pos); - auto end_char = *(rb.position() + end_pos); - if (begin_char == end_char && (begin_char == '"' || begin_char == '\'')) { - int64_t length = end_pos - begin_pos - 1; - read_buffer = ReadBuffer(rb.position() + begin_pos + 1, (length > 0 ? length : 0)); - tmp_len = (length > 0 ? length : 0); - has_quota = true; - } - - // handle null, need to distinguish null and "null" - if (!has_quota && tmp_len == 4 && strncmp(read_buffer.position(), "null", 4) == 0) { + // handle null element, need to distinguish null and "null" + if (!has_quota && element.size == 4 && strncmp(element.data, "null", 4) == 0) { // insert null auto& nested_null_col = reinterpret_cast<ColumnNullable&>(nested_column); nested_null_col.get_nested_column().insert_default(); nested_null_col.get_null_map_data().push_back(1); - } else { - auto st = nested->from_string(read_buffer, &nested_column); - if (!st.ok()) { - // we should do revert if error - array_column->pop_back(size); - return st; - } + ++element_num; + continue; + } + + // handle normal element + ReadBuffer read_buffer(const_cast<char*>(element.data), element.size); + auto st = nested->from_string(read_buffer, &nested_column); + if (!st.ok()) { + // we should do revert if error + array_column->pop_back(element_num); + return st; } - rb.position() += nested_str_len; - DCHECK_LE(rb.position(), rb.end()); - ++size; + ++element_num; } - offsets.push_back(offsets.back() + size); + offsets.push_back(offsets.back() + element_num); return Status::OK(); } diff --git a/regression-test/data/load_p0/broker_load/simple_array.csv b/regression-test/data/load_p0/broker_load/simple_array.csv index 0514c702de..3dbe488757 100644 --- a/regression-test/data/load_p0/broker_load/simple_array.csv +++ b/regression-test/data/load_p0/broker_load/simple_array.csv @@ -2,4 +2,5 @@ 2/[6,7,8,9,10]/[32767,32768,32769]/[65534,65535,65536]/["a","b","c","d","e"]/["hello","world"]/["1991-01-01"]/["1991-01-01 00:00:00"]/[0.33,0.67]/[3.1415926,0.878787878]/[1,1.2,1.3] 3/[]/[32767,32768,32769]/[null,null,65536]/["a","b","c","d","e"]/["happy","birthday"]/["1991-01-01"]/["1991-01-01 00:00:00"]/[0.33,0.67]/[3.1415926,0.878787878]/[1,1.2,1.3] 4/[null]/[32767,32768,32769]/[ null,null,65536]/["a","b","c","d","e"]/["hello","world"]/["1991-01-01"]/["1991-01-01 00:00:00"]/[0.33,0.67]/[3.1415926,0.878787878]/[1,1.2,1.3] -5/[null,null]/[32767,32768,null]/[65534,null,65536]/["a","b","c","d","e"]/["hello","world"]/["1991-01-01"]/["1991-01-01 00:00:00"]/[0.33,0.67]/[3.1415926,0.878787878]/[1,1.2,1.3] \ No newline at end of file +5/[null,null]/[32767,32768,null]/[65534,null,65536]/["a","b","c","d","e"]/["hello","world"]/["1991-01-01"]/["1991-01-01 00:00:00"]/[0.33,0.67]/[3.1415926,0.878787878]/[1,1.2,1.3] +6/[null,null,,]/[ 32767, 32768 ,NULL]/[65534,null,65536 ]/["a","b","c","d","e"]/["hello,,,","world][]"]/[ "1991-01-01" ]/["1991-01-01 00:00:00"]/[0.33,0.67]/[3.1415926,0.878787878]/[,] \ No newline at end of file diff --git a/regression-test/data/load_p0/broker_load/test_array_load.out b/regression-test/data/load_p0/broker_load/test_array_load.out index 6c568b6ef1..ff4d69cbff 100644 --- a/regression-test/data/load_p0/broker_load/test_array_load.out +++ b/regression-test/data/load_p0/broker_load/test_array_load.out @@ -21,6 +21,7 @@ 3 [] [32767, 32768, 32769] [NULL, NULL, 65536] ['a', 'b', 'c', 'd', 'e'] ['happy', 'birthday'] [1991-01-01] [1991-01-01 00:00:00] [0.33, 0.67] [3.1415926, 0.878787878] [1, 1.2, 1.3] 4 [NULL] [32767, 32768, 32769] [NULL, NULL, 65536] ['a', 'b', 'c', 'd', 'e'] ['hello', 'world'] [1991-01-01] [1991-01-01 00:00:00] [0.33, 0.67] [3.1415926, 0.878787878] [1, 1.2, 1.3] 5 [NULL, NULL] [32767, 32768, NULL] [65534, NULL, 65536] ['a', 'b', 'c', 'd', 'e'] ['hello', 'world'] [1991-01-01] [1991-01-01 00:00:00] [0.33, 0.67] [3.1415926, 0.878787878] [1, 1.2, 1.3] +6 [NULL, NULL, 0, 0] [32767, 32768, NULL] [65534, NULL, 65536] ['a', 'b', 'c', 'd', 'e'] ['hello,,,', 'world][]'] [1991-01-01] [1991-01-01 00:00:00] [0.33, 0.67] [3.1415926, 0.878787878] [0, 0] 100 [1, 2, 3] [32767, 32768, 32769] [65534, 65535, 65536] ['a', 'b', 'c'] ['hello', 'world'] [2022-07-13] [2022-07-13 12:30:00] [0.33, 0.67] [3.1415926, 0.878787878] [4, 5.5, 6.67] -- !select -- @@ -29,6 +30,7 @@ 3 [] [32767, 32768, 32769] [NULL, NULL, 65536] ['a', 'b', 'c', 'd', 'e'] ['happy', 'birthday'] [1991-01-01] [1991-01-01 00:00:00] [0.33, 0.67] [3.1415926, 0.878787878] [1.000000, 1.200000, 1.300000] 4 [NULL] [32767, 32768, 32769] [NULL, NULL, 65536] ['a', 'b', 'c', 'd', 'e'] ['hello', 'world'] [1991-01-01] [1991-01-01 00:00:00] [0.33, 0.67] [3.1415926, 0.878787878] [1.000000, 1.200000, 1.300000] 5 [NULL, NULL] [32767, 32768, NULL] [65534, NULL, 65536] ['a', 'b', 'c', 'd', 'e'] ['hello', 'world'] [1991-01-01] [1991-01-01 00:00:00] [0.33, 0.67] [3.1415926, 0.878787878] [1.000000, 1.200000, 1.300000] +6 [NULL, NULL, 0, 0] [32767, 32768, NULL] [65534, NULL, 65536] ['a', 'b', 'c', 'd', 'e'] ['hello,,,', 'world][]'] [1991-01-01] [1991-01-01 00:00:00] [0.33, 0.67] [3.1415926, 0.878787878] [0.000000, 0.000000] 100 [1, 2, 3] [32767, 32768, 32769] [65534, 65535, 65536] ['a', 'b', 'c'] ['hello', 'world'] [2022-07-13] [2022-07-13 12:30:00] [0.33, 0.67] [3.1415926, 0.878787878] [4.000000, 5.500000, 6.670000] -- !select -- @@ -61,6 +63,7 @@ 3 [] [32767, 32768, 32769] [NULL, NULL, 65536] ['a', 'b', 'c', 'd', 'e'] ['happy', 'birthday'] [1991-01-01] [1991-01-01 00:00:00] [0.33, 0.67] [3.1415926, 0.878787878] [1, 1.2, 1.3] 4 [NULL] [32767, 32768, 32769] [NULL, NULL, 65536] ['a', 'b', 'c', 'd', 'e'] ['hello', 'world'] [1991-01-01] [1991-01-01 00:00:00] [0.33, 0.67] [3.1415926, 0.878787878] [1, 1.2, 1.3] 5 [NULL, NULL] [32767, 32768, NULL] [65534, NULL, 65536] ['a', 'b', 'c', 'd', 'e'] ['hello', 'world'] [1991-01-01] [1991-01-01 00:00:00] [0.33, 0.67] [3.1415926, 0.878787878] [1, 1.2, 1.3] +6 [NULL, NULL, 0, 0] [32767, 32768, NULL] [65534, NULL, 65536] ['a', 'b', 'c', 'd', 'e'] ['hello,,,', 'world][]'] [1991-01-01] [1991-01-01 00:00:00] [0.33, 0.67] [3.1415926, 0.878787878] [0, 0] 100 [1, 2, 3] [32767, 32768, 32769] [65534, 65535, 65536] ['a', 'b', 'c'] ['hello', 'world'] [2022-07-13] [2022-07-13 12:30:00] [0.33, 0.67] [3.1415926, 0.878787878] [4, 5.5, 6.67] -- !select -- @@ -69,6 +72,7 @@ 3 [] [32767, 32768, 32769] [NULL, NULL, 65536] ['a', 'b', 'c', 'd', 'e'] ['happy', 'birthday'] [1991-01-01] [1991-01-01 00:00:00] [0.33, 0.67] [3.1415926, 0.878787878] [1.000000, 1.200000, 1.300000] 4 [NULL] [32767, 32768, 32769] [NULL, NULL, 65536] ['a', 'b', 'c', 'd', 'e'] ['hello', 'world'] [1991-01-01] [1991-01-01 00:00:00] [0.33, 0.67] [3.1415926, 0.878787878] [1.000000, 1.200000, 1.300000] 5 [NULL, NULL] [32767, 32768, NULL] [65534, NULL, 65536] ['a', 'b', 'c', 'd', 'e'] ['hello', 'world'] [1991-01-01] [1991-01-01 00:00:00] [0.33, 0.67] [3.1415926, 0.878787878] [1.000000, 1.200000, 1.300000] +6 [NULL, NULL, 0, 0] [32767, 32768, NULL] [65534, NULL, 65536] ['a', 'b', 'c', 'd', 'e'] ['hello,,,', 'world][]'] [1991-01-01] [1991-01-01 00:00:00] [0.33, 0.67] [3.1415926, 0.878787878] [0.000000, 0.000000] 100 [1, 2, 3] [32767, 32768, 32769] [65534, 65535, 65536] ['a', 'b', 'c'] ['hello', 'world'] [2022-07-13] [2022-07-13 12:30:00] [0.33, 0.67] [3.1415926, 0.878787878] [4.000000, 5.500000, 6.670000] -- !select -- diff --git a/regression-test/data/load_p0/stream_load/test_stream_load.out b/regression-test/data/load_p0/stream_load/test_stream_load.out index 162f76ca74..85a8835f51 100644 --- a/regression-test/data/load_p0/stream_load/test_stream_load.out +++ b/regression-test/data/load_p0/stream_load/test_stream_load.out @@ -4,7 +4,7 @@ -2 -50 \N 1 \N \N \N \N \N \N \N \N j \N -- !sql1 -- -2019 9 9 9 7.7 a 2019-09-09 1970-01-01T08:33:39 k7 9.0 9.0 +2019 9 9 9 7.700 a 2019-09-09 1970-01-01T08:33:39 k7 9.0 9.0 -- !all11 -- 2500 @@ -67,5 +67,5 @@ 6 [NULL, NULL] [NULL, NULL] [NULL, NULL] [NULL, NULL] [NULL, 'null'] [NULL, NULL] [NULL, NULL] [NULL, NULL] [NULL, NULL] [NULL, NULL, NULL, NULL, NULL, NULL] 6 [NULL, NULL] [NULL, NULL] [NULL, NULL] [NULL, NULL] [NULL, NULL] [NULL, NULL] [NULL, NULL] [NULL, NULL] [NULL, NULL] [NULL, NULL, NULL, NULL, NULL, NULL] 7 [1, 2, 3, 4, 5] \N \N \N \N \N \N \N \N \N -8 [1, 2, 3, 4, 5] \N \N \N \N \N [] \N [NULL] \N +8 [1, 2, 3, 4, 5] \N \N \N \N \N [NULL] \N [NULL] \N diff --git a/regression-test/data/query_p0/sql_functions/cast_function/test_cast_string_to_array.out b/regression-test/data/query_p0/sql_functions/cast_function/test_cast_string_to_array.out index 1eb1418926..90d372c356 100644 --- a/regression-test/data/query_p0/sql_functions/cast_function/test_cast_string_to_array.out +++ b/regression-test/data/query_p0/sql_functions/cast_function/test_cast_string_to_array.out @@ -15,13 +15,14 @@ [2022-09-01] -- !sql -- -[1, 2, 3, 0, 0] +[1, 2, 3, 0, 0, 0] -- !sql -- -['a', 'b', 'c', '', ''] +['a', 'b', 'c', '', '', ''] -- !sql -- -[1.34, 2.01, 0, 0] +[1.34, 2.01, 0, 0, 0] -- !sql -- -[2022-09-01, 0000-00-00] +[2022-09-01, 0000-00-00, 0000-00-00] + --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org For additional commands, e-mail: commits-h...@doris.apache.org