xiaokang commented on code in PR #26973:
URL: https://github.com/apache/doris/pull/26973#discussion_r1396690419
##########
regression-test/suites/datatype_p0/nested_types/base_cases/one_level_nestedtypes_with_s3data.groovy:
##########

@@ -0,0 +1,283 @@
+import org.apache.commons.lang3.StringUtils
+
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+suite("one_level_nestedtypes_with_s3data") {
+    sql """set enable_nereids_planner=false"""
+    sql """ set enable_fallback_to_original_planner=true;"""
+    // this test case aim to test one-level nested type with s3 data
+
+
+    String ak = getS3AK()
+    String sk = getS3SK()
+    String s3_endpoint = getS3Endpoint()
+    String bucket = context.config.otherConfigs.get("s3BucketName");
+
+
+    def dataFilePath = "https://"+"${bucket}"+"."+"${s3_endpoint}"+"/regression/datalake"
+//    def dataFilePath = "/mnt/disk1/wangqiannan/export/ol"
+    def table_names = ["test_array_one_level", "test_map_one_level", "test_struct_one_level"]
+
+    def colNameArr = ["c_bool", "c_tinyint", "c_smallint", "c_int", "c_bigint", "c_largeint", "c_float",
+                      "c_double", "c_decimal", "c_decimalv3", "c_date", "c_datetime", "c_datev2", "c_datetimev2",
+                      "c_char", "c_varchar", "c_string"]
+
+    def groupby_or_orderby_exception = {is_groupby, table_name, col_name ->
+        test {
+            if (is_groupby) {
+                sql "select ${col_name} from ${table_name} group by ${col_name};"
+            } else {
+                sql "select ${col_name} from ${table_name} order by ${col_name};"
+            }
+            exception("errCode = 2, detailMessage = Doris hll, bitmap, array, map, struct, jsonb, variant column must use with specific function, and don't support filter, group by or order by")
+        }
+    }
+
+    def groupby_or_orderby_element_at = {is_groupby, table_name, agg_expr ->
+        if (is_groupby) {
+            order_qt_sql "select ${agg_expr} from ${table_name} where k1 IS NOT NULL group by ${agg_expr};"
+        } else {
+            order_qt_sql "select ${agg_expr} from ${table_name} where k1 IS NOT NULL order by ${agg_expr} limit 10;"
+        }
+    }
+
+    def be_id = 10139
+    def load_from_tvf = {table_name, uri_file, format ->
+        if (format == "csv") {
+            order_qt_sql_tvf """select * from local(
+                "file_path" = "${uri_file}",
+                "backend_id" = "${be_id}",
+                "column_separator"="|",
+                "format" = "${format}") order by c1 limit 10; """
+
+            sql """
+            insert into ${table_name} select * from local(
+                "file_path" = "${uri_file}",
+                "backend_id" = "${be_id}",
+                "column_separator"="|",
+                "format" = "${format}"); """
+        } else {
+            order_qt_sql_tvf """select * from local(
+                "file_path" = "${uri_file}",
+                "backend_id" = "${be_id}",
+                "column_separator"="|",
+                "format" = "${format}") order by k1 limit 10; """
+
+            sql """
+            insert into ${table_name} select * from local(
+                "file_path" = "${uri_file}",
+                "backend_id" = "${be_id}",
+                "column_separator"="|",
+                "format" = "${format}"); """
+        }
+        // where to filter different format data
+        qt_select_doris """ select * from ${table_name} where k1 IS NOT NULL order by k1 limit 10; """
+    }
+    def load_from_s3 = {table_name, uri_file, format ->
+        if (format == "csv") {
+            order_qt_sql_s3 """select * from s3(
+                "uri" = "${uri_file}",
+                "s3.access_key"= "${ak}",
+                "s3.secret_key" = "${sk}",
+                "format" = "${format}",
+                "column_separator"="|",
+                "read_json_by_line"="true") order by c1 limit 10; """
+
+            sql """
+            insert into ${table_name} select * from s3(
+                "uri" = "${uri_file}",
+                "s3.access_key"= "${ak}",
+                "s3.secret_key" = "${sk}",
+                "format" = "${format}",
+                "column_separator"="|",
+                "read_json_by_line"="true"); """
+        } else {
+            order_qt_sql_s3 """select * from s3(
+                "uri" = "${uri_file}",
+                "s3.access_key"= "${ak}",
+                "s3.secret_key" = "${sk}",
+                "format" = "${format}",
+                "read_json_by_line"="true") order by k1 limit 10; """
+
+            sql """
+            insert into ${table_name} select * from s3(
+                "uri" = "${uri_file}",
+                "s3.access_key"= "${ak}",
+                "s3.secret_key" = "${sk}",
+                "format" = "${format}",
+                "read_json_by_line"="true"); """
+        }
+        // where to filter different format data
+        qt_select_doris """ select * from ${table_name} where k1 IS NOT NULL order by k1 limit 10; """
+    }
+
+    // step1. create table
+    // step2. load from s3
+    //   step 2.1 format: parquet|orc|json|csv
+    // step3. select *
+    // step4. select element_at(column in first, -1(last), null, 0)
+    // step5. select * from table where element_at(column) equals expr just ok
+    // step6. select * from table where groupby|orderby column
+    // step7. select * from table where groupby|orderby element_at(column)
+
+    def format_order = [
+            "parquet", "orc",
+//            "json",
+            "csv"]
+    // create tables
+    for (int i = 0; i < table_names.size(); ++i) {
+        sql """ DROP TABLE IF EXISTS ${table_names[i]} """
+        String result = create_table_with_nested_type(1, [i], table_names[i])
+        sql result
+    }
+
+    //========================= ARRAY =========================
+    // insert into doris table
+    ArrayList<String> array_files = [
+            "${dataFilePath}/one_level_array.parquet",
+            "${dataFilePath}/one_level_array.orc",
+//            "${dataFilePath}/one_level_array.json",
+            "${dataFilePath}/one_level_array.csv"
+    ]
+    int fi = 0
+    for (String f : array_files) {
+        sql "truncate table ${table_names[2]};"
+//        load_from_tvf(table_names[0], f, format_order[fi])
+        load_from_s3(table_names[0], f, format_order[fi])
+        ++ fi
+    }
+    // select element_at(column)
+    for (String col : colNameArr) {
+        // first
+        order_qt_select_arr "select ${col}[1] from ${table_names[0]} where k1 IS NOT NULL order by k1 limit 10;"
+        // last
+        order_qt_select_arr "select ${col}[-1] from ${table_names[0]} where k1 IS NOT NULL order by k1 limit 10;"
+        // null
+        order_qt_select_arr_null "select ${col}[0] from ${table_names[0]} where k1 IS NOT NULL order by k1 limit 10;"
+        // null
+        order_qt_select_arr_null "select ${col}[1000] from ${table_names[0]} where k1 IS NOT NULL order by k1 limit 10;"
+    }
+    // select * from table where element_at(column) with equal expr
+    for (String col : colNameArr) {
+        order_qt_select_arr "select ${col}[1], ${col}[-1] from ${table_names[0]} where k1 IS NOT NULL AND ${col}[1]<${col}[-1] order by k1 limit 10;"
+    }
+    // select * from table where groupby|orderby column will meet exception
+    for (String col : colNameArr) {
+        groupby_or_orderby_exception(true, table_names[0], col)
+        groupby_or_orderby_exception(false, table_names[0], col)
+    }
+    // select * from table where groupby|orderby element_at(column)
+    for (String col : colNameArr) {
+        String agg_expr = "${col}[1]"
+        groupby_or_orderby_element_at(true, table_names[0], agg_expr)
+        groupby_or_orderby_element_at(false, table_names[0], agg_expr)
+    }
+
+    //========================== MAP ==========================
+    // insert into doris table
+    ArrayList<String> map_files = [
+            "${dataFilePath}/one_level_map.parquet",
+            "${dataFilePath}/one_level_map.orc",
+//            "${dataFilePath}/one_level_map.json",
+            "${dataFilePath}/one_level_map.csv"
+    ]
+    fi = 0
+    for (String f : map_files) {
+        sql "truncate table ${table_names[2]};"

Review Comment:
   table_names[1]

##########
regression-test/suites/datatype_p0/nested_types/base_cases/one_level_nestedtypes_with_s3data.groovy:
##########

@@ -0,0 +1,283 @@
+    //========================= ARRAY =========================
+    // insert into doris table
+    ArrayList<String> array_files = [
+            "${dataFilePath}/one_level_array.parquet",
+            "${dataFilePath}/one_level_array.orc",
+//            "${dataFilePath}/one_level_array.json",
+            "${dataFilePath}/one_level_array.csv"
+    ]
+    int fi = 0
+    for (String f : array_files) {
+        sql "truncate table ${table_names[2]};"

Review Comment:
   table_names[0]
##########
be/src/vec/data_types/serde/data_type_struct_serde.cpp:
##########

@@ -94,106 +93,125 @@ Status DataTypeStructSerDe::deserialize_one_cell_from_json(IColumn& column, Slic
         }
         return Status::OK();
     }
-
-    ReadBuffer rb(slice.data, slice.size);
-    ++rb.position();
+    // remove '{' '}'
+    slice.remove_prefix(1);
+    slice.remove_suffix(1);
+    slice.trim_prefix();
     bool is_explicit_names = false;
-    std::vector<std::string> field_names;
-    std::vector<ReadBuffer> field_rbs;
-    std::vector<size_t> field_pos;
-
-    while (!rb.eof()) {
-        StringRef slot(rb.position(), rb.count());
-        bool has_quota = false;
-        bool is_name = false;
-        if (!next_slot_from_string(rb, slot, is_name, has_quota)) {
-            return Status::InvalidArgument("Cannot read struct field from text '{}'",
-                                           slot.to_string());
-        }
-        if (is_name) {
-            std::string name = slot.to_string();
-            if (!next_slot_from_string(rb, slot, is_name, has_quota)) {
-                return Status::InvalidArgument("Cannot read struct field from text '{}'",
-                                               slot.to_string());
-            }
-            ReadBuffer field_rb(const_cast<char*>(slot.data), slot.size);
-            field_names.push_back(name);
-            field_rbs.push_back(field_rb);
-
-            if (!is_explicit_names) {
-                is_explicit_names = true;
+    int nested_level = 0;
+    bool has_quote = false;
+    int start_pos = 0;
+    size_t slice_size = slice.size;
+    bool key_added = false;
+    int idx = 0;
+    char quote_char = 0;
+
+    auto elem_size = elemSerDeSPtrs.size();
+    int field_pos = 0;
+
+    for (; idx < slice_size; ++idx) {
+        char c = slice[idx];
+        if (c == '"' || c == '\'') {
+            if (!has_quote) {
+                quote_char = c;
+                has_quote = !has_quote;
+            } else if (has_quote && quote_char == c) {
+                quote_char = 0;
+                has_quote = !has_quote;
             }
-        } else {
-            ReadBuffer field_rb(const_cast<char*>(slot.data), slot.size);
-            field_rbs.push_back(field_rb);
-        }
-    }
-
-    // TODO: should we support insert default field value when actual field number is less than
-    // schema field number?
-    if (field_rbs.size() != elemSerDeSPtrs.size()) {
-        std::string cmp_str = field_rbs.size() > elemSerDeSPtrs.size() ? "more" : "less";
-        return Status::InvalidArgument(
-                "Actual struct field number {} is {} than schema field number {}.",
-                field_rbs.size(), cmp_str, elemSerDeSPtrs.size());
-    }
-
-    if (is_explicit_names) {
-        if (field_names.size() != field_rbs.size()) {
-            return Status::InvalidArgument(
-                    "Struct field name number {} is not equal to field number {}.",
-                    field_names.size(), field_rbs.size());
-        }
-        std::unordered_set<std::string> name_set;
-        for (size_t i = 0; i < field_names.size(); i++) {
-            // check duplicate fields
-            auto ret = name_set.insert(field_names[i]);
-            if (!ret.second) {
-                return Status::InvalidArgument("Struct field name {} is duplicate with others.",
-                                               field_names[i]);
+        } else if (c == '\\' && idx + 1 < slice_size) { //escaped
+            ++idx;
+        } else if (!has_quote && (c == '[' || c == '{')) {
+            ++nested_level;
+        } else if (!has_quote && (c == ']' || c == '}')) {
+            --nested_level;
+        } else if (!has_quote && nested_level == 0 && c == options.map_key_delim && !key_added) {

Review Comment:
   what's map_key_delim?
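A note on the question above: judging from the quoted code, options.map_key_delim appears to act as the separator between a struct field's name and its value, since the branch only fires outside quotes, at nesting level 0, and before a key has been added for the current field. The sketch below is illustrative only; it is plain Groovy rather than Doris C++ and assumes a ':' delimiter, showing why the quote, escape, and nesting tracking matters when searching for that separator.

    // illustrative Groovy version of the scan; keyDelim stands in for the
    // assumed role of options.map_key_delim (':' here)
    def splitKeyValue = { String field, String keyDelim = ':' ->
        boolean hasQuote = false
        String quoteChar = null
        int nestedLevel = 0
        int idx = 0
        while (idx < field.length()) {
            String c = field[idx]
            if (c == '"' || c == "'") {
                if (!hasQuote) {
                    quoteChar = c
                    hasQuote = true
                } else if (quoteChar == c) {
                    quoteChar = null
                    hasQuote = false
                }
            } else if (c == '\\' && idx + 1 < field.length()) {
                idx++                                   // skip the escaped character
            } else if (!hasQuote && (c == '[' || c == '{')) {
                nestedLevel++
            } else if (!hasQuote && (c == ']' || c == '}')) {
                nestedLevel--
            } else if (!hasQuote && nestedLevel == 0 && c == keyDelim) {
                // the first top-level, unquoted delimiter splits name from value
                return [field.substring(0, idx), field.substring(idx + 1)]
            }
            idx++
        }
        return [field]                                  // no top-level delimiter found
    }

    assert splitKeyValue('"name":"doris"') == ['"name"', '"doris"']
    assert splitKeyValue('{"a":1}') == ['{"a":1}']      // ':' inside braces is not a split point

A ':' inside a quoted string or inside a nested array or map must not be mistaken for the name/value boundary, which is what the has_quote and nested_level checks in the reviewed loop guard against.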
"more" : "less"; - return Status::InvalidArgument( - "Actual struct field number {} is {} than schema field number {}.", - field_rbs.size(), cmp_str, elemSerDeSPtrs.size()); - } - - if (is_explicit_names) { - if (field_names.size() != field_rbs.size()) { - return Status::InvalidArgument( - "Struct field name number {} is not equal to field number {}.", - field_names.size(), field_rbs.size()); - } - std::unordered_set<std::string> name_set; - for (size_t i = 0; i < field_names.size(); i++) { - // check duplicate fields - auto ret = name_set.insert(field_names[i]); - if (!ret.second) { - return Status::InvalidArgument("Struct field name {} is duplicate with others.", - field_names[i]); + } else if (c == '\\' && idx + 1 < slice_size) { //escaped + ++idx; + } else if (!has_quote && (c == '[' || c == '{')) { + ++nested_level; + } else if (!has_quote && (c == ']' || c == '}')) { + --nested_level; + } else if (!has_quote && nested_level == 0 && c == options.map_key_delim && !key_added) { Review Comment: what's map_key_delim? ########## regression-test/suites/datatype_p0/nested_types/base_cases/one_level_nestedtypes_with_s3data.groovy: ########## @@ -0,0 +1,283 @@ +import org.apache.commons.lang3.StringUtils + +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +suite("one_level_nestedtypes_with_s3data") { + sql """set enable_nereids_planner=false""" + sql """ set enable_fallback_to_original_planner=true;""" + // this test case aim to test one-level nested type with s3 data + + + String ak = getS3AK() + String sk = getS3SK() + String s3_endpoint = getS3Endpoint() + String bucket = context.config.otherConfigs.get("s3BucketName"); + + + def dataFilePath = "https://"+"${bucket}"+"."+"${s3_endpoint}"+"/regression/datalake" +// def dataFilePath = "/mnt/disk1/wangqiannan/export/ol" + def table_names = ["test_array_one_level", "test_map_one_level", "test_struct_one_level"] + + def colNameArr = ["c_bool", "c_tinyint", "c_smallint", "c_int", "c_bigint", "c_largeint", "c_float", Review Comment: colNames -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org For additional commands, e-mail: commits-h...@doris.apache.org