This is an automated email from the ASF dual-hosted git repository.

kxiao pushed a commit to branch branch-2.0
in repository https://gitbox.apache.org/repos/asf/doris.git


The following commit(s) were added to refs/heads/branch-2.0 by this push:
     new adfef1d790e [fix](ES Catalog)Check isArray before parse json to array 
(#39104) (#39275)
adfef1d790e is described below

commit adfef1d790e2474fabad98a68f8bceaa1ebdec73
Author: qiye <jianliang5...@gmail.com>
AuthorDate: Wed Aug 14 15:21:27 2024 +0800

    [fix](ES Catalog)Check isArray before parse json to array (#39104) (#39275)
---
 be/src/exec/es/es_scroll_parser.cpp                | 25 +++++-----
 .../scripts/data/composite_type_array_bulk.json    | 10 ++++
 .../elasticsearch/scripts/es_init.sh               | 58 ++++++++++++++++++++++
 .../index/array_meta_composite_type_array.json     |  9 ++++
 .../scripts/index/es6_composite_type_array.json    | 14 ++++++
 .../scripts/index/es7_composite_type_array.json    | 12 +++++
 .../external_table_p0/es/test_es_query.groovy      | 25 ++++++++++
 7 files changed, 141 insertions(+), 12 deletions(-)

diff --git a/be/src/exec/es/es_scroll_parser.cpp 
b/be/src/exec/es/es_scroll_parser.cpp
index 138bbb81bc0..22c9d3ee337 100644
--- a/be/src/exec/es/es_scroll_parser.cpp
+++ b/be/src/exec/es/es_scroll_parser.cpp
@@ -100,9 +100,9 @@ static const std::string ERROR_COL_DATA_IS_ARRAY =
 static const std::string INVALID_NULL_VALUE =
         "Invalid null value occurs: Non-null column `$0` contains NULL";
 
-#define RETURN_ERROR_IF_COL_IS_ARRAY(col, type)                              \
+#define RETURN_ERROR_IF_COL_IS_ARRAY(col, type, is_array)                    \
     do {                                                                     \
-        if (col.IsArray()) {                                                 \
+        if (col.IsArray() == is_array) {                                     \
             std::stringstream ss;                                            \
             ss << "Expected value of type: " << type_to_string(type)         \
                << "; but found type: " << json_type_to_string(col.GetType()) \
@@ -167,7 +167,7 @@ Status get_int_value(const rapidjson::Value& col, 
PrimitiveType type, void* slot
         return Status::OK();
     }
 
-    RETURN_ERROR_IF_COL_IS_ARRAY(col, type);
+    RETURN_ERROR_IF_COL_IS_ARRAY(col, type, true);
     RETURN_ERROR_IF_COL_IS_NOT_STRING(col, type);
 
     StringParser::ParseResult result;
@@ -294,7 +294,7 @@ Status get_date_int(const rapidjson::Value& col, 
PrimitiveType type, bool pure_d
         return get_date_value_int<T, RT>(col[0], type, false, slot, time_zone);
     } else {
         // this would happened just only when `enable_docvalue_scan = false`, 
and field has string format date from _source
-        RETURN_ERROR_IF_COL_IS_ARRAY(col, type);
+        RETURN_ERROR_IF_COL_IS_ARRAY(col, type, true);
         RETURN_ERROR_IF_COL_IS_NOT_STRING(col, type);
         return get_date_value_int<T, RT>(col, type, true, slot, time_zone);
     }
@@ -322,7 +322,7 @@ Status get_float_value(const rapidjson::Value& col, 
PrimitiveType type, void* sl
         return Status::OK();
     }
 
-    RETURN_ERROR_IF_COL_IS_ARRAY(col, type);
+    RETURN_ERROR_IF_COL_IS_ARRAY(col, type, true);
     RETURN_ERROR_IF_COL_IS_NOT_STRING(col, type);
 
     StringParser::ParseResult result;
@@ -351,7 +351,7 @@ Status insert_float_value(const rapidjson::Value& col, 
PrimitiveType type,
         return Status::OK();
     }
 
-    RETURN_ERROR_IF_COL_IS_ARRAY(col, type);
+    RETURN_ERROR_IF_COL_IS_ARRAY(col, type, true);
     RETURN_ERROR_IF_COL_IS_NOT_STRING(col, type);
 
     StringParser::ParseResult result;
@@ -390,7 +390,7 @@ Status insert_int_value(const rapidjson::Value& col, 
PrimitiveType type,
         return Status::OK();
     }
 
-    RETURN_ERROR_IF_COL_IS_ARRAY(col, type);
+    RETURN_ERROR_IF_COL_IS_ARRAY(col, type, true);
     RETURN_ERROR_IF_COL_IS_NOT_STRING(col, type);
 
     StringParser::ParseResult result;
@@ -543,7 +543,7 @@ Status ScrollParser::fill_columns(const TupleDescriptor* 
tuple_desc,
                     val = col[0].GetString();
                 }
             } else {
-                RETURN_ERROR_IF_COL_IS_ARRAY(col, type);
+                RETURN_ERROR_IF_COL_IS_ARRAY(col, type, true);
                 if (!col.IsString()) {
                     val = json_value_to_string(col);
                 } else {
@@ -618,7 +618,7 @@ Status ScrollParser::fill_columns(const TupleDescriptor* 
tuple_desc,
 
             const rapidjson::Value& str_col = is_nested_str ? col[0] : col;
 
-            RETURN_ERROR_IF_COL_IS_ARRAY(col, type);
+            RETURN_ERROR_IF_COL_IS_ARRAY(col, type, true);
 
             const std::string& val = str_col.GetString();
             size_t val_size = str_col.GetStringLength();
@@ -644,7 +644,7 @@ Status ScrollParser::fill_columns(const TupleDescriptor* 
tuple_desc,
                         val = col[0].GetString();
                     }
                 } else {
-                    RETURN_ERROR_IF_COL_IS_ARRAY(col, type);
+                    RETURN_ERROR_IF_COL_IS_ARRAY(col, type, true);
                     if (!col.IsString()) {
                         val = json_value_to_string(col);
                     } else {
@@ -676,13 +676,14 @@ Status ScrollParser::fill_columns(const TupleDescriptor* 
tuple_desc,
         case TYPE_ARRAY: {
             vectorized::Array array;
             const auto& sub_type = 
tuple_desc->slots()[i]->type().children[0].type;
-            for (auto& sub_col : col.GetArray()) {
+            RETURN_ERROR_IF_COL_IS_ARRAY(col, type, false);
+            for (const auto& sub_col : col.GetArray()) {
                 switch (sub_type) {
                 case TYPE_CHAR:
                 case TYPE_VARCHAR:
                 case TYPE_STRING: {
                     std::string val;
-                    RETURN_ERROR_IF_COL_IS_ARRAY(sub_col, sub_type);
+                    RETURN_ERROR_IF_COL_IS_ARRAY(sub_col, sub_type, true);
                     if (!sub_col.IsString()) {
                         val = json_value_to_string(sub_col);
                     } else {
diff --git 
a/docker/thirdparties/docker-compose/elasticsearch/scripts/data/composite_type_array_bulk.json
 
b/docker/thirdparties/docker-compose/elasticsearch/scripts/data/composite_type_array_bulk.json
new file mode 100755
index 00000000000..8654ab25e6f
--- /dev/null
+++ 
b/docker/thirdparties/docker-compose/elasticsearch/scripts/data/composite_type_array_bulk.json
@@ -0,0 +1,10 @@
+{"name": "Andy", "sports": "soccer"}
+{"name": "Betty", "sports": "pingpong ball"}
+{"name": "Cindy", "sports": "武术"}
+{"name": "David", "sports": ["volleyball"]}
+{"name": "Emily", "sports": ["baseball", "golf", "hockey"]}
+{"name": "Frank", "sports": ["rugby", "cricket", "boxing"]}
+{"name": "Grace", "sports": ["table tennis", "badminton", "athletics"]}
+{"name": "Henry", "sports": ["archery", "fencing", "weightlifting"]}
+{"name": "Ivy", "sports": ["judo", "karate", "taekwondo"]}
+{"name": "Jack", "sports": ["wrestling", "gymnastics", "surfing"]}
\ No newline at end of file
diff --git 
a/docker/thirdparties/docker-compose/elasticsearch/scripts/es_init.sh 
b/docker/thirdparties/docker-compose/elasticsearch/scripts/es_init.sh
index 575118d8db9..258c6c83ef5 100755
--- a/docker/thirdparties/docker-compose/elasticsearch/scripts/es_init.sh
+++ b/docker/thirdparties/docker-compose/elasticsearch/scripts/es_init.sh
@@ -16,6 +16,30 @@
 # specific language governing permissions and limitations
 # under the License.
 
+generate_bulk_request() {
+    local index_name=$1
+    local type_value=$2
+    local id_prefix=$3
+    local data_file=$4
+    local output_file=$5
+
+    // clear output file
+    echo "" > "$output_file"
+
+    local id=1
+    while IFS= read -r line; do
+        if [ -n "$type_value" ]; then
+            echo "{\"index\": {\"_index\": \"$index_name\", \"_type\": 
\"$type_value\", \"_id\": \"${id_prefix}${id}\"}}"  >> "$output_file"
+        else
+            echo "{\"index\": {\"_index\": \"$index_name\", \"_id\": 
\"${id_prefix}${id}\"}}"  >> "$output_file"
+        fi
+        echo "$line"  >> "$output_file"
+        id=$((id + 1))
+    done < "$data_file"
+}
+
+array_data_file="/mnt/scripts/data/composite_type_array_bulk.json"
+
 # es 5
 # create index test1
 # shellcheck disable=SC2154
@@ -43,6 +67,14 @@ curl "http://${ES_5_HOST}:9200/test2_20220808/doc/_mapping"; 
-H "Content-Type:app
 curl "http://${ES_5_HOST}:9200/test2_20220809/doc/_mapping"; -H 
"Content-Type:application/json" -X PUT -d "@/mnt/scripts/index/array_meta.json"
 # create index .hide
 curl "http://${ES_5_HOST}:9200/.hide"; -H "Content-Type:application/json" -X 
PUT -d "@/mnt/scripts/index/es6_hide.json"
+# create index composite_type_array
+curl "http://${ES_5_HOST}:9200/composite_type_array"; -H 
"Content-Type:application/json" -X PUT -d 
"@/mnt/scripts/index/es6_composite_type_array.json"
+# put data with bulk for composite_type_array
+bulk_request_file="/mnt/scripts/data/bulk_request_es5.json"
+generate_bulk_request "composite_type_array" "doc" "item_" "$array_data_file" 
"$bulk_request_file"
+curl -X POST "http://${ES_5_HOST}:9200/_bulk"; --data-binary 
"@$bulk_request_file" -H "Content-Type: application/json"
+# put _meta for composite_type_array
+curl "http://${ES_5_HOST}:9200/composite_type_array/doc/_mapping"; -H 
"Content-Type:application/json" -X PUT -d 
"@/mnt/scripts/index/array_meta_composite_type_array.json"
 
 # es 6
 # create index test1
@@ -70,6 +102,14 @@ curl "http://${ES_6_HOST}:9200/test2_20220808/doc/_mapping"; 
-H "Content-Type:app
 curl "http://${ES_6_HOST}:9200/test2_20220809/doc/_mapping"; -H 
"Content-Type:application/json" -X PUT -d "@/mnt/scripts/index/array_meta.json"
 # create index .hide
 curl "http://${ES_6_HOST}:9200/.hide"; -H "Content-Type:application/json" -X 
PUT -d "@/mnt/scripts/index/es6_hide.json"
+# create index composite_type_array
+curl "http://${ES_6_HOST}:9200/composite_type_array"; -H 
"Content-Type:application/json" -X PUT -d 
"@/mnt/scripts/index/es6_composite_type_array.json"
+# put data with bulk for composite_type_array
+bulk_request_file="/mnt/scripts/data/bulk_request_es6.json"
+generate_bulk_request "composite_type_array" "doc" "item_" "$array_data_file" 
"$bulk_request_file"
+curl -X POST "http://${ES_6_HOST}:9200/_bulk"; --data-binary 
"@$bulk_request_file" -H "Content-Type: application/json"
+# put _meta for composite_type_array
+curl "http://${ES_6_HOST}:9200/composite_type_array/doc/_mapping"; -H 
"Content-Type:application/json" -X PUT -d 
"@/mnt/scripts/index/array_meta_composite_type_array.json"
 
 # es7
 # create index test1
@@ -106,6 +146,15 @@ curl "http://${ES_7_HOST}:9200/test2_20220809/_mapping"; -H 
"Content-Type:applica
 # create index .hide
 curl "http://${ES_7_HOST}:9200/.hide"; -H "Content-Type:application/json" -X 
PUT -d "@/mnt/scripts/index/es7_hide.json"
 
+# create index composite_type_array
+curl "http://${ES_7_HOST}:9200/composite_type_array"; -H 
"Content-Type:application/json" -X PUT -d 
"@/mnt/scripts/index/es7_composite_type_array.json"
+# put data with bulk for composite_type_array
+bulk_request_file="/mnt/scripts/data/bulk_request_es7.json"
+generate_bulk_request "composite_type_array" "_doc" "item_" "$array_data_file" 
"$bulk_request_file"
+curl -X POST "http://${ES_7_HOST}:9200/_bulk"; --data-binary 
"@$bulk_request_file" -H "Content-Type: application/json"
+# put _meta for composite_type_array
+curl "http://${ES_7_HOST}:9200/composite_type_array/_mapping"; -H 
"Content-Type:application/json" -X PUT -d 
"@/mnt/scripts/index/array_meta_composite_type_array.json"
+
 # es8
 # create index test1
 curl "http://${ES_8_HOST}:9200/test1"; -H "Content-Type:application/json" -X 
PUT -d "@/mnt/scripts/index/es7_test1.json"
@@ -138,3 +187,12 @@ curl "http://${ES_8_HOST}:9200/test3_20231005/_doc/1"; -H 
"Content-Type:applicati
 curl "http://${ES_8_HOST}:9200/test1/_mapping"; -H 
"Content-Type:application/json" -X PUT -d "@/mnt/scripts/index/array_meta.json"
 curl "http://${ES_8_HOST}:9200/test2_20220808/_mapping"; -H 
"Content-Type:application/json" -X PUT -d "@/mnt/scripts/index/array_meta.json"
 curl "http://${ES_8_HOST}:9200/test2_20220809/_mapping"; -H 
"Content-Type:application/json" -X PUT -d "@/mnt/scripts/index/array_meta.json"
+
+# create index composite_type_array
+curl "http://${ES_8_HOST}:9200/composite_type_array"; -H 
"Content-Type:application/json" -X PUT -d 
"@/mnt/scripts/index/es7_composite_type_array.json"
+# put data with bulk for composite_type_array
+bulk_request_file="/mnt/scripts/data/bulk_request_es8.json"
+generate_bulk_request "composite_type_array" "" "item_" "$array_data_file" 
"$bulk_request_file"
+curl -X POST "http://${ES_8_HOST}:9200/_bulk"; --data-binary 
"@$bulk_request_file" -H "Content-Type: application/json"
+# put _meta for composite_type_array
+curl "http://${ES_8_HOST}:9200/composite_type_array/_mapping"; -H 
"Content-Type:application/json" -X PUT -d 
"@/mnt/scripts/index/array_meta_composite_type_array.json"
\ No newline at end of file
diff --git 
a/docker/thirdparties/docker-compose/elasticsearch/scripts/index/array_meta_composite_type_array.json
 
b/docker/thirdparties/docker-compose/elasticsearch/scripts/index/array_meta_composite_type_array.json
new file mode 100644
index 00000000000..9c55390149d
--- /dev/null
+++ 
b/docker/thirdparties/docker-compose/elasticsearch/scripts/index/array_meta_composite_type_array.json
@@ -0,0 +1,9 @@
+{
+  "_meta": {
+    "doris":{
+      "array_fields":[
+        "sports"
+      ]
+    }
+  }
+}
diff --git 
a/docker/thirdparties/docker-compose/elasticsearch/scripts/index/es6_composite_type_array.json
 
b/docker/thirdparties/docker-compose/elasticsearch/scripts/index/es6_composite_type_array.json
new file mode 100755
index 00000000000..2921628cf38
--- /dev/null
+++ 
b/docker/thirdparties/docker-compose/elasticsearch/scripts/index/es6_composite_type_array.json
@@ -0,0 +1,14 @@
+{
+  "settings": {
+    "number_of_shards": 1,
+    "number_of_replicas": 0
+  },
+  "mappings": {
+    "doc": {
+      "properties": {
+        "name": { "type": "keyword" },
+        "sports": { "type": "keyword", "doc_values": false}
+      }
+    }
+  }
+}
diff --git 
a/docker/thirdparties/docker-compose/elasticsearch/scripts/index/es7_composite_type_array.json
 
b/docker/thirdparties/docker-compose/elasticsearch/scripts/index/es7_composite_type_array.json
new file mode 100644
index 00000000000..8fdd6c88ce7
--- /dev/null
+++ 
b/docker/thirdparties/docker-compose/elasticsearch/scripts/index/es7_composite_type_array.json
@@ -0,0 +1,12 @@
+{
+  "settings": {
+    "number_of_shards": 1,
+    "number_of_replicas": 0
+  },
+  "mappings": {
+    "properties": {
+      "name": { "type": "keyword" },
+      "sports": { "type": "keyword", "doc_values": false}
+    }
+  }
+}
diff --git a/regression-test/suites/external_table_p0/es/test_es_query.groovy 
b/regression-test/suites/external_table_p0/es/test_es_query.groovy
index b3b852831de..6d3c87a7455 100644
--- a/regression-test/suites/external_table_p0/es/test_es_query.groovy
+++ b/regression-test/suites/external_table_p0/es/test_es_query.groovy
@@ -214,6 +214,13 @@ suite("test_es_query", 
"p0,external,es,external_docker,external_docker_es") {
             order_qt_sql_5_18 """select message from test1 where 
not_null_or_empty(message)"""
             order_qt_sql_5_19 """select * from test1 where 
esquery(c_unsigned_long, '{"match":{"c_unsigned_long":0}}')"""
             order_qt_sql_5_20 """select c_person, c_user, 
json_extract(c_person, '\$.[0].name'), json_extract(c_user, '\$.[1].last') from 
test1;"""
+            try {
+                sql """select * from composite_type_array;"""
+                fail("Should not reach here")
+            } catch (Exception e) {
+                logger.error(e.getMessage())
+                assertTrue(e.getMessage().contains("Expected value of type: 
ARRAY; but found type: Varchar/Char; Document slice is : \"soccer\""))
+            }
 
             sql """switch test_es_query_es6"""
             // order_qt_sql_6_01 """show tables"""
@@ -236,6 +243,12 @@ suite("test_es_query", 
"p0,external,es,external_docker,external_docker_es") {
             order_qt_sql_6_18 """select message from test1 where 
not_null_or_empty(message)"""
             order_qt_sql_6_19 """select * from test1 where esquery(c_person, 
'{"match":{"c_person.name":"Andy"}}')"""
             order_qt_sql_6_20 """select c_person, c_user, 
json_extract(c_person, '\$.[0].name'), json_extract(c_user, '\$.[1].last') from 
test1;"""
+            try {
+                sql """select * from composite_type_array;"""
+                fail("Should not reach here")
+            } catch (Exception e) {
+                assertTrue(e.getMessage().contains("Expected value of type: 
ARRAY; but found type: Varchar/Char; Document slice is : \"soccer\""))
+            }
 
             List<List<String>> tables6N = sql """show tables"""
             boolean notContainHide = true
@@ -282,6 +295,12 @@ suite("test_es_query", 
"p0,external,es,external_docker,external_docker_es") {
             order_qt_sql_7_23 """select * from test1 where level = 'debug'"""
             order_qt_sql_7_24 """select * from test1 where esquery(c_float, 
'{"match":{"c_float":1.1}}')"""
             order_qt_sql_7_25 """select c_person, c_user, 
json_extract(c_person, '\$.[0].name'), json_extract(c_user, '\$.[1].last') from 
test1;"""
+            try {
+                sql """select * from composite_type_array;"""
+                fail("Should not reach here")
+            } catch (Exception e) {
+                assertTrue(e.getMessage().contains("Expected value of type: 
ARRAY; but found type: Varchar/Char; Document slice is : \"soccer\""))
+            }
 
             List<List<String>> tables7N = sql """show tables"""
             boolean notContainHide7 = true
@@ -328,6 +347,12 @@ suite("test_es_query", 
"p0,external,es,external_docker,external_docker_es") {
             order_qt_sql_8_21 """select * from test1 where level = 'debug'"""
             order_qt_sql_8_22 """select * from test1 where esquery(c_ip, 
'{"match":{"c_ip":"192.168.0.1"}}')"""
             order_qt_sql_8_23 """select c_person, c_user, 
json_extract(c_person, '\$.[0].name'), json_extract(c_user, '\$.[1].last') from 
test1;"""
+            try {
+                sql """select * from composite_type_array;"""
+                fail("Should not reach here")
+            } catch (Exception e) {
+                assertTrue(e.getMessage().contains("Expected value of type: 
ARRAY; but found type: Varchar/Char; Document slice is : \"soccer\""))
+            }
         
         }
 


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org
For additional commands, e-mail: commits-h...@doris.apache.org

Reply via email to