This is an automated email from the ASF dual-hosted git repository.

morningman pushed a commit to branch branch-1.2-unstable
in repository https://gitbox.apache.org/repos/asf/doris.git

commit 6f022b40a2e33b23fec48d36abc66621daa1e2f1
Author: camby <104178...@qq.com>
AuthorDate: Wed Nov 9 16:58:08 2022 +0800

    [refactor](array) refactor DataTypeArray from_string (#13905)
    
    Refactor DataTypeArray::from_string to make it clearer;
    support ',' and ']' inside string elements, for example: ['hello,,,', 'world][]'];
    support empty elements, such as [,] ==> [0,0].
    Co-authored-by: cambyzju <zhuxiaol...@baidu.com>
---
 be/src/exec/broker_scanner.cpp                     |   4 -
 be/src/vec/data_types/data_type_array.cpp          | 159 ++++++++++++---------
 .../data/load_p0/broker_load/simple_array.csv      |   3 +-
 .../data/load_p0/broker_load/test_array_load.out   |   4 +
 .../data/load_p0/stream_load/test_stream_load.out  |   4 +-
 .../cast_function/test_cast_string_to_array.out    |   9 +-
 6 files changed, 108 insertions(+), 75 deletions(-)

diff --git a/be/src/exec/broker_scanner.cpp b/be/src/exec/broker_scanner.cpp
index 3d9d5cf0f2..d65b413927 100644
--- a/be/src/exec/broker_scanner.cpp
+++ b/be/src/exec/broker_scanner.cpp
@@ -495,10 +495,6 @@ Status BrokerScanner::_line_to_src_tuple(const Slice& 
line) {
         return Status::OK();
     }
 
-    if (!check_array_format(_split_values)) {
-        return Status::OK();
-    }
-
     for (int i = 0; i < _split_values.size(); ++i) {
         auto slot_desc = _src_slot_descs[i];
         const Slice& value = _split_values[i];
diff --git a/be/src/vec/data_types/data_type_array.cpp 
b/be/src/vec/data_types/data_type_array.cpp
index 9f29980e60..6827cb3779 100644
--- a/be/src/vec/data_types/data_type_array.cpp
+++ b/be/src/vec/data_types/data_type_array.cpp
@@ -175,6 +175,69 @@ std::string DataTypeArray::to_string(const IColumn& 
column, size_t row_num) cons
     return ss.str();
 }
 
+bool next_element_from_string(ReadBuffer& rb, StringRef& output, bool& 
has_quota) {
+    StringRef element(rb.position(), 0);
+    has_quota = false;
+    if (rb.eof()) {
+        return false;
+    }
+
+    // ltrim
+    while (!rb.eof() && isspace(*rb.position())) {
+        ++rb.position();
+        element.data = rb.position();
+    }
+
+    // parse string
+    if (*rb.position() == '"' || *rb.position() == '\'') {
+        const char str_sep = *rb.position();
+        size_t str_len = 1;
+        // search until next '"' or '\''
+        while (str_len < rb.count() && *(rb.position() + str_len) != str_sep) {
+            ++str_len;
+        }
+        // invalid string
+        if (str_len >= rb.count()) {
+            rb.position() = rb.end();
+            return false;
+        }
+        has_quota = true;
+        rb.position() += str_len + 1;
+        element.size += str_len + 1;
+    }
+
+    // parse array element until array separator ',' or end ']'
+    while (!rb.eof() && (*rb.position() != ',') && (rb.count() != 1 || 
*rb.position() != ']')) {
+        // invalid elements such as ["123" 456,"789" 777]
+        // correct elements such as ["123"    ,"789"    ]
+        if (has_quota && !isspace(*rb.position())) {
+            return false;
+        }
+        ++rb.position();
+        ++element.size;
+    }
+    // invalid array element
+    if (rb.eof()) {
+        return false;
+    }
+    // adjust read buffer position to first char of next array element
+    ++rb.position();
+
+    // rtrim
+    while (element.size > 0 && isspace(element.data[element.size - 1])) {
+        --element.size;
+    }
+
+    // trim '"' and '\'' for string
+    if (element.size >= 2 && (element.data[0] == '"' || element.data[0] == 
'\'') &&
+        element.data[0] == element.data[element.size - 1]) {
+        ++element.data;
+        element.size -= 2;
+    }
+    output = element;
+    return true;
+}
+
 Status DataTypeArray::from_string(ReadBuffer& rb, IColumn* column) const {
     DCHECK(!rb.eof());
     // only support one level now
@@ -191,85 +254,53 @@ Status DataTypeArray::from_string(ReadBuffer& rb, 
IColumn* column) const {
         return Status::InvalidArgument("Array does not end with ']' character, 
found '{}'",
                                        *(rb.end() - 1));
     }
+    // empty array []
+    if (rb.count() == 2) {
+        offsets.push_back(offsets.back());
+        return Status::OK();
+    }
     ++rb.position();
-    bool first = true;
-    size_t size = 0;
-    while (!rb.eof() && *rb.position() != ']') {
-        if (!first) {
-            if (*rb.position() == ',') {
-                ++rb.position();
-            } else {
-                return Status::InvalidArgument(
-                        "Cannot read array from text, expected comma or end of 
array, found '{}'",
-                        *rb.position());
-            }
-        }
-        first = false;
-        if (*rb.position() == ']') {
-            break;
-        }
-        size_t nested_str_len = 0;
-        char* temp_char = rb.position() + nested_str_len;
-        while (*(temp_char) != ']' && *(temp_char) != ',' && temp_char != 
rb.end()) {
-            ++nested_str_len;
-            temp_char = rb.position() + nested_str_len;
+
+    size_t element_num = 0;
+    // parse array element until end of array
+    while (!rb.eof()) {
+        StringRef element(rb.position(), rb.count());
+        bool has_quota = false;
+        if (!next_element_from_string(rb, element, has_quota)) {
+            return Status::InvalidArgument("Cannot read array element from 
text '{}'",
+                                           element.to_string());
         }
 
-        // dispose the case of [123,,,]
-        if (nested_str_len == 0) {
+        // handle empty element
+        if (element.size == 0) {
             auto& nested_null_col = 
reinterpret_cast<ColumnNullable&>(nested_column);
             nested_null_col.get_nested_column().insert_default();
             nested_null_col.get_null_map_data().push_back(0);
-            ++size;
+            ++element_num;
             continue;
         }
 
-        // Note: here we will trim elements, such as
-        // ["2020-09-01", "2021-09-01"  , "2022-09-01" ] ==> 
["2020-09-01","2021-09-01","2022-09-01"]
-        size_t begin_pos = 0;
-        size_t end_pos = nested_str_len - 1;
-        while (begin_pos < end_pos) {
-            if (isspace(*(rb.position() + begin_pos))) {
-                ++begin_pos;
-            } else if (isspace(*(rb.position() + end_pos))) {
-                --end_pos;
-            } else {
-                break;
-            }
-        }
-
-        // dispose the case of ["123"] or ['123']
-        bool has_quota = false;
-        size_t tmp_len = nested_str_len;
-        ReadBuffer read_buffer(rb.position(), nested_str_len);
-        auto begin_char = *(rb.position() + begin_pos);
-        auto end_char = *(rb.position() + end_pos);
-        if (begin_char == end_char && (begin_char == '"' || begin_char == 
'\'')) {
-            int64_t length = end_pos - begin_pos - 1;
-            read_buffer = ReadBuffer(rb.position() + begin_pos + 1, (length > 
0 ? length : 0));
-            tmp_len = (length > 0 ? length : 0);
-            has_quota = true;
-        }
-
-        // handle null, need to distinguish null and "null"
-        if (!has_quota && tmp_len == 4 && strncmp(read_buffer.position(), 
"null", 4) == 0) {
+        // handle null element, need to distinguish null and "null"
+        if (!has_quota && element.size == 4 && strncmp(element.data, "null", 
4) == 0) {
             // insert null
             auto& nested_null_col = 
reinterpret_cast<ColumnNullable&>(nested_column);
             nested_null_col.get_nested_column().insert_default();
             nested_null_col.get_null_map_data().push_back(1);
-        } else {
-            auto st = nested->from_string(read_buffer, &nested_column);
-            if (!st.ok()) {
-                // we should do revert if error
-                array_column->pop_back(size);
-                return st;
-            }
+            ++element_num;
+            continue;
+        }
+
+        // handle normal element
+        ReadBuffer read_buffer(const_cast<char*>(element.data), element.size);
+        auto st = nested->from_string(read_buffer, &nested_column);
+        if (!st.ok()) {
+            // we should do revert if error
+            array_column->pop_back(element_num);
+            return st;
         }
-        rb.position() += nested_str_len;
-        DCHECK_LE(rb.position(), rb.end());
-        ++size;
+        ++element_num;
     }
-    offsets.push_back(offsets.back() + size);
+    offsets.push_back(offsets.back() + element_num);
     return Status::OK();
 }
 
diff --git a/regression-test/data/load_p0/broker_load/simple_array.csv 
b/regression-test/data/load_p0/broker_load/simple_array.csv
index 0514c702de..3dbe488757 100644
--- a/regression-test/data/load_p0/broker_load/simple_array.csv
+++ b/regression-test/data/load_p0/broker_load/simple_array.csv
@@ -2,4 +2,5 @@
 
2/[6,7,8,9,10]/[32767,32768,32769]/[65534,65535,65536]/["a","b","c","d","e"]/["hello","world"]/["1991-01-01"]/["1991-01-01
 00:00:00"]/[0.33,0.67]/[3.1415926,0.878787878]/[1,1.2,1.3]
 
3/[]/[32767,32768,32769]/[null,null,65536]/["a","b","c","d","e"]/["happy","birthday"]/["1991-01-01"]/["1991-01-01
 00:00:00"]/[0.33,0.67]/[3.1415926,0.878787878]/[1,1.2,1.3]
 4/[null]/[32767,32768,32769]/[ 
null,null,65536]/["a","b","c","d","e"]/["hello","world"]/["1991-01-01"]/["1991-01-01
 00:00:00"]/[0.33,0.67]/[3.1415926,0.878787878]/[1,1.2,1.3]
-5/[null,null]/[32767,32768,null]/[65534,null,65536]/["a","b","c","d","e"]/["hello","world"]/["1991-01-01"]/["1991-01-01
 00:00:00"]/[0.33,0.67]/[3.1415926,0.878787878]/[1,1.2,1.3]
\ No newline at end of file
+5/[null,null]/[32767,32768,null]/[65534,null,65536]/["a","b","c","d","e"]/["hello","world"]/["1991-01-01"]/["1991-01-01
 00:00:00"]/[0.33,0.67]/[3.1415926,0.878787878]/[1,1.2,1.3]
+6/[null,null,,]/[ 32767, 32768 ,NULL]/[65534,null,65536  
]/["a","b","c","d","e"]/["hello,,,","world][]"]/[ "1991-01-01" ]/["1991-01-01 
00:00:00"]/[0.33,0.67]/[3.1415926,0.878787878]/[,]
\ No newline at end of file
diff --git a/regression-test/data/load_p0/broker_load/test_array_load.out 
b/regression-test/data/load_p0/broker_load/test_array_load.out
index 6c568b6ef1..ff4d69cbff 100644
--- a/regression-test/data/load_p0/broker_load/test_array_load.out
+++ b/regression-test/data/load_p0/broker_load/test_array_load.out
@@ -21,6 +21,7 @@
 3      []      [32767, 32768, 32769]   [NULL, NULL, 65536]     ['a', 'b', 'c', 
'd', 'e']       ['happy', 'birthday']   [1991-01-01]    [1991-01-01 00:00:00]   
[0.33, 0.67]    [3.1415926, 0.878787878]        [1, 1.2, 1.3]
 4      [NULL]  [32767, 32768, 32769]   [NULL, NULL, 65536]     ['a', 'b', 'c', 
'd', 'e']       ['hello', 'world']      [1991-01-01]    [1991-01-01 00:00:00]   
[0.33, 0.67]    [3.1415926, 0.878787878]        [1, 1.2, 1.3]
 5      [NULL, NULL]    [32767, 32768, NULL]    [65534, NULL, 65536]    ['a', 
'b', 'c', 'd', 'e']       ['hello', 'world']      [1991-01-01]    [1991-01-01 
00:00:00]   [0.33, 0.67]    [3.1415926, 0.878787878]        [1, 1.2, 1.3]
+6      [NULL, NULL, 0, 0]      [32767, 32768, NULL]    [65534, NULL, 65536]    
['a', 'b', 'c', 'd', 'e']       ['hello,,,', 'world][]']        [1991-01-01]    
[1991-01-01 00:00:00]   [0.33, 0.67]    [3.1415926, 0.878787878]        [0, 0]
 100    [1, 2, 3]       [32767, 32768, 32769]   [65534, 65535, 65536]   ['a', 
'b', 'c'] ['hello', 'world']      [2022-07-13]    [2022-07-13 12:30:00]   
[0.33, 0.67]    [3.1415926, 0.878787878]        [4, 5.5, 6.67]
 
 -- !select --
@@ -29,6 +30,7 @@
 3      []      [32767, 32768, 32769]   [NULL, NULL, 65536]     ['a', 'b', 'c', 
'd', 'e']       ['happy', 'birthday']   [1991-01-01]    [1991-01-01 00:00:00]   
[0.33, 0.67]    [3.1415926, 0.878787878]        [1.000000, 1.200000, 1.300000]
 4      [NULL]  [32767, 32768, 32769]   [NULL, NULL, 65536]     ['a', 'b', 'c', 
'd', 'e']       ['hello', 'world']      [1991-01-01]    [1991-01-01 00:00:00]   
[0.33, 0.67]    [3.1415926, 0.878787878]        [1.000000, 1.200000, 1.300000]
 5      [NULL, NULL]    [32767, 32768, NULL]    [65534, NULL, 65536]    ['a', 
'b', 'c', 'd', 'e']       ['hello', 'world']      [1991-01-01]    [1991-01-01 
00:00:00]   [0.33, 0.67]    [3.1415926, 0.878787878]        [1.000000, 
1.200000, 1.300000]
+6      [NULL, NULL, 0, 0]      [32767, 32768, NULL]    [65534, NULL, 65536]    
['a', 'b', 'c', 'd', 'e']       ['hello,,,', 'world][]']        [1991-01-01]    
[1991-01-01 00:00:00]   [0.33, 0.67]    [3.1415926, 0.878787878]        
[0.000000, 0.000000]
 100    [1, 2, 3]       [32767, 32768, 32769]   [65534, 65535, 65536]   ['a', 
'b', 'c'] ['hello', 'world']      [2022-07-13]    [2022-07-13 12:30:00]   
[0.33, 0.67]    [3.1415926, 0.878787878]        [4.000000, 5.500000, 6.670000]
 
 -- !select --
@@ -61,6 +63,7 @@
 3      []      [32767, 32768, 32769]   [NULL, NULL, 65536]     ['a', 'b', 'c', 
'd', 'e']       ['happy', 'birthday']   [1991-01-01]    [1991-01-01 00:00:00]   
[0.33, 0.67]    [3.1415926, 0.878787878]        [1, 1.2, 1.3]
 4      [NULL]  [32767, 32768, 32769]   [NULL, NULL, 65536]     ['a', 'b', 'c', 
'd', 'e']       ['hello', 'world']      [1991-01-01]    [1991-01-01 00:00:00]   
[0.33, 0.67]    [3.1415926, 0.878787878]        [1, 1.2, 1.3]
 5      [NULL, NULL]    [32767, 32768, NULL]    [65534, NULL, 65536]    ['a', 
'b', 'c', 'd', 'e']       ['hello', 'world']      [1991-01-01]    [1991-01-01 
00:00:00]   [0.33, 0.67]    [3.1415926, 0.878787878]        [1, 1.2, 1.3]
+6      [NULL, NULL, 0, 0]      [32767, 32768, NULL]    [65534, NULL, 65536]    
['a', 'b', 'c', 'd', 'e']       ['hello,,,', 'world][]']        [1991-01-01]    
[1991-01-01 00:00:00]   [0.33, 0.67]    [3.1415926, 0.878787878]        [0, 0]
 100    [1, 2, 3]       [32767, 32768, 32769]   [65534, 65535, 65536]   ['a', 
'b', 'c'] ['hello', 'world']      [2022-07-13]    [2022-07-13 12:30:00]   
[0.33, 0.67]    [3.1415926, 0.878787878]        [4, 5.5, 6.67]
 
 -- !select --
@@ -69,6 +72,7 @@
 3      []      [32767, 32768, 32769]   [NULL, NULL, 65536]     ['a', 'b', 'c', 
'd', 'e']       ['happy', 'birthday']   [1991-01-01]    [1991-01-01 00:00:00]   
[0.33, 0.67]    [3.1415926, 0.878787878]        [1.000000, 1.200000, 1.300000]
 4      [NULL]  [32767, 32768, 32769]   [NULL, NULL, 65536]     ['a', 'b', 'c', 
'd', 'e']       ['hello', 'world']      [1991-01-01]    [1991-01-01 00:00:00]   
[0.33, 0.67]    [3.1415926, 0.878787878]        [1.000000, 1.200000, 1.300000]
 5      [NULL, NULL]    [32767, 32768, NULL]    [65534, NULL, 65536]    ['a', 
'b', 'c', 'd', 'e']       ['hello', 'world']      [1991-01-01]    [1991-01-01 
00:00:00]   [0.33, 0.67]    [3.1415926, 0.878787878]        [1.000000, 
1.200000, 1.300000]
+6      [NULL, NULL, 0, 0]      [32767, 32768, NULL]    [65534, NULL, 65536]    
['a', 'b', 'c', 'd', 'e']       ['hello,,,', 'world][]']        [1991-01-01]    
[1991-01-01 00:00:00]   [0.33, 0.67]    [3.1415926, 0.878787878]        
[0.000000, 0.000000]
 100    [1, 2, 3]       [32767, 32768, 32769]   [65534, 65535, 65536]   ['a', 
'b', 'c'] ['hello', 'world']      [2022-07-13]    [2022-07-13 12:30:00]   
[0.33, 0.67]    [3.1415926, 0.878787878]        [4.000000, 5.500000, 6.670000]
 
 -- !select --
diff --git a/regression-test/data/load_p0/stream_load/test_stream_load.out 
b/regression-test/data/load_p0/stream_load/test_stream_load.out
index 162f76ca74..85a8835f51 100644
--- a/regression-test/data/load_p0/stream_load/test_stream_load.out
+++ b/regression-test/data/load_p0/stream_load/test_stream_load.out
@@ -4,7 +4,7 @@
 -2     -50     \N      1       \N      \N      \N      \N      \N      \N      
\N      \N      j       \N
 
 -- !sql1 --
-2019   9       9       9       7.7     a       2019-09-09      
1970-01-01T08:33:39     k7      9.0     9.0
+2019   9       9       9       7.700   a       2019-09-09      
1970-01-01T08:33:39     k7      9.0     9.0
 
 -- !all11 --
 2500
@@ -67,5 +67,5 @@
 6      [NULL, NULL]    [NULL, NULL]    [NULL, NULL]    [NULL, NULL]    [NULL, 
'null']  [NULL, NULL]    [NULL, NULL]    [NULL, NULL]    [NULL, NULL]    [NULL, 
NULL, NULL, NULL, NULL, NULL]
 6      [NULL, NULL]    [NULL, NULL]    [NULL, NULL]    [NULL, NULL]    [NULL, 
NULL]    [NULL, NULL]    [NULL, NULL]    [NULL, NULL]    [NULL, NULL]    [NULL, 
NULL, NULL, NULL, NULL, NULL]
 7      [1, 2, 3, 4, 5] \N      \N      \N      \N      \N      \N      \N      
\N      \N
-8      [1, 2, 3, 4, 5] \N      \N      \N      \N      \N      []      \N      
[NULL]  \N
+8      [1, 2, 3, 4, 5] \N      \N      \N      \N      \N      [NULL]  \N      
[NULL]  \N
 
diff --git 
a/regression-test/data/query_p0/sql_functions/cast_function/test_cast_string_to_array.out
 
b/regression-test/data/query_p0/sql_functions/cast_function/test_cast_string_to_array.out
index 1eb1418926..90d372c356 100644
--- 
a/regression-test/data/query_p0/sql_functions/cast_function/test_cast_string_to_array.out
+++ 
b/regression-test/data/query_p0/sql_functions/cast_function/test_cast_string_to_array.out
@@ -15,13 +15,14 @@
 [2022-09-01]
 
 -- !sql --
-[1, 2, 3, 0, 0]
+[1, 2, 3, 0, 0, 0]
 
 -- !sql --
-['a', 'b', 'c', '', '']
+['a', 'b', 'c', '', '', '']
 
 -- !sql --
-[1.34, 2.01, 0, 0]
+[1.34, 2.01, 0, 0, 0]
 
 -- !sql --
-[2022-09-01, 0000-00-00]
+[2022-09-01, 0000-00-00, 0000-00-00]
+


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org
For additional commands, e-mail: commits-h...@doris.apache.org

Reply via email to