This is an automated email from the ASF dual-hosted git repository.

morningman pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/doris.git


The following commit(s) were added to refs/heads/master by this push:
     new e30c3f3a65b [fix](csv_reader)fix bug that Read garbled files caused be 
crash. (#24164)
e30c3f3a65b is described below

commit e30c3f3a65bf6eb7887b3128d233376648de3acd
Author: daidai <2017501...@qq.com>
AuthorDate: Wed Sep 13 14:12:55 2023 +0800

    [fix](csv_reader)fix bug that Read garbled files caused be crash. (#24164)
    
    Fix a bug where reading garbled files caused the BE (backend) to crash.
---
 be/src/exec/text_converter.cpp                     |   3 +-
 .../vec/data_types/serde/data_type_array_serde.cpp |   9 +++-
 .../vec/data_types/serde/data_type_map_serde.cpp   |  11 +++--
 .../data_types/serde/data_type_struct_serde.cpp    |   5 ++-
 .../hive/test_text_garbled_file.out                | Bin 0 -> 296830 bytes
 .../hive/test_text_garbled_file.groovy             |  46 +++++++++++++++++++++
 6 files changed, 67 insertions(+), 7 deletions(-)

diff --git a/be/src/exec/text_converter.cpp b/be/src/exec/text_converter.cpp
index 95232678ea0..59417bc9265 100644
--- a/be/src/exec/text_converter.cpp
+++ b/be/src/exec/text_converter.cpp
@@ -330,9 +330,10 @@ bool TextConverter::_write_data(const TypeDescriptor& 
type_desc,
                 kv = i;
                 continue;
             }
-            if (i == len || data[i] == _collection_delimiter) {
+            if ((i == len || data[i] == _collection_delimiter) && i >= kv + 1) 
{
                 ranges.push_back({from, kv, i - 1});
                 from = i + 1;
+                kv = from;
             }
         }
 
diff --git a/be/src/vec/data_types/serde/data_type_array_serde.cpp 
b/be/src/vec/data_types/serde/data_type_array_serde.cpp
index 1dca5299894..5aa78a1f886 100644
--- a/be/src/vec/data_types/serde/data_type_array_serde.cpp
+++ b/be/src/vec/data_types/serde/data_type_array_serde.cpp
@@ -71,7 +71,9 @@ Status 
DataTypeArraySerDe::deserialize_column_from_json_vector(IColumn& column,
 
 Status DataTypeArraySerDe::deserialize_one_cell_from_json(IColumn& column, 
Slice& slice,
                                                           const FormatOptions& 
options) const {
-    DCHECK(!slice.empty());
+    if (slice.empty()) {
+        return Status::InvalidArgument("slice is empty!");
+    }
     auto& array_column = assert_cast<ColumnArray&>(column);
     auto& offsets = array_column.get_offsets();
     IColumn& nested_column = array_column.get_data();
@@ -132,6 +134,9 @@ Status 
DataTypeArraySerDe::deserialize_one_cell_from_json(IColumn& column, Slice
 Status DataTypeArraySerDe::deserialize_one_cell_from_hive_text(IColumn& 
column, Slice& slice,
                                                                const 
FormatOptions& options,
                                                                int 
nesting_level) const {
+    if (slice.empty()) {
+        return Status::InvalidArgument("slice is empty!");
+    }
     auto& array_column = assert_cast<ColumnArray&>(column);
     auto& offsets = array_column.get_offsets();
     IColumn& nested_column = array_column.get_data();
@@ -303,4 +308,4 @@ Status DataTypeArraySerDe::write_column_to_mysql(const 
IColumn& column,
 }
 
 } // namespace vectorized
-} // namespace doris
\ No newline at end of file
+} // namespace doris
diff --git a/be/src/vec/data_types/serde/data_type_map_serde.cpp 
b/be/src/vec/data_types/serde/data_type_map_serde.cpp
index a0e6636c507..cce5986b195 100644
--- a/be/src/vec/data_types/serde/data_type_map_serde.cpp
+++ b/be/src/vec/data_types/serde/data_type_map_serde.cpp
@@ -65,7 +65,9 @@ void DataTypeMapSerDe::serialize_one_cell_to_json(const 
IColumn& column, int row
 Status DataTypeMapSerDe::deserialize_one_cell_from_hive_text(IColumn& column, 
Slice& slice,
                                                              const 
FormatOptions& options,
                                                              int 
nesting_level) const {
-    DCHECK(!slice.empty());
+    if (slice.empty()) {
+        return Status::InvalidArgument("slice is empty!");
+    }
     auto& array_column = assert_cast<ColumnMap&>(column);
     auto& offsets = array_column.get_offsets();
     IColumn& nested_key_column = array_column.get_keys();
@@ -92,10 +94,11 @@ Status 
DataTypeMapSerDe::deserialize_one_cell_from_hive_text(IColumn& column, Sl
             kv = i;
             continue;
         }
-        if (i == slice.size || slice[i] == collection_delimiter) {
+        if ((i == slice.size || slice[i] == collection_delimiter) && i >= kv + 
1) {
             key_slices.push_back({slice.data + from, kv - from});
             value_slices.push_back({slice.data + kv + 1, i - 1 - kv});
             from = i + 1;
+            kv = from;
         }
     }
 
@@ -169,7 +172,9 @@ Status 
DataTypeMapSerDe::deserialize_column_from_json_vector(IColumn& column,
 
 Status DataTypeMapSerDe::deserialize_one_cell_from_json(IColumn& column, 
Slice& slice,
                                                         const FormatOptions& 
options) const {
-    DCHECK(!slice.empty());
+    if (slice.empty()) {
+        return Status::InvalidArgument("slice is empty!");
+    }
     auto& array_column = assert_cast<ColumnMap&>(column);
     auto& offsets = array_column.get_offsets();
     IColumn& nested_key_column = array_column.get_keys();
diff --git a/be/src/vec/data_types/serde/data_type_struct_serde.cpp 
b/be/src/vec/data_types/serde/data_type_struct_serde.cpp
index b202d0fb237..06ec4d709bb 100644
--- a/be/src/vec/data_types/serde/data_type_struct_serde.cpp
+++ b/be/src/vec/data_types/serde/data_type_struct_serde.cpp
@@ -44,6 +44,9 @@ void DataTypeStructSerDe::write_one_cell_to_jsonb(const 
IColumn& column, JsonbWr
 Status DataTypeStructSerDe::deserialize_one_cell_from_hive_text(IColumn& 
column, Slice& slice,
                                                                 const 
FormatOptions& options,
                                                                 int 
nesting_level) const {
+    if (slice.empty()) {
+        return Status::InvalidArgument("slice is empty!");
+    }
     char struct_delimiter = options.get_collection_delimiter(nesting_level);
 
     std::vector<Slice> slices;
@@ -190,4 +193,4 @@ Status DataTypeStructSerDe::write_column_to_mysql(const 
IColumn& column,
 }
 
 } // namespace vectorized
-} // namespace doris
\ No newline at end of file
+} // namespace doris
diff --git 
a/regression-test/data/external_table_p2/hive/test_text_garbled_file.out 
b/regression-test/data/external_table_p2/hive/test_text_garbled_file.out
new file mode 100644
index 00000000000..b003cd49e97
Binary files /dev/null and 
b/regression-test/data/external_table_p2/hive/test_text_garbled_file.out differ
diff --git 
a/regression-test/suites/external_table_p2/hive/test_text_garbled_file.groovy 
b/regression-test/suites/external_table_p2/hive/test_text_garbled_file.groovy
new file mode 100644
index 00000000000..a3ea6a3bcc2
--- /dev/null
+++ 
b/regression-test/suites/external_table_p2/hive/test_text_garbled_file.groovy
@@ -0,0 +1,46 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+suite("test_text_garbled_file", 
"p2,external,hive,external_remote,external_remote_hive") {
+    // Test reading garbled Hive text files; guards against the BE hanging.
+
+    String enabled = context.config.otherConfigs.get("enableExternalHiveTest")
+    if (enabled != null && enabled.equalsIgnoreCase("true")) {
+        String extHiveHmsHost = 
context.config.otherConfigs.get("extHiveHmsHost")
+        String extHiveHmsPort = 
context.config.otherConfigs.get("extHiveHmsPort")
+        String catalog_name = "test_text_garbled_file"
+        sql """drop catalog if exists ${catalog_name};"""
+        sql """
+            create catalog if not exists ${catalog_name} properties (
+                'type'='hms',
+                'hadoop.username' = 'hadoop',
+                'hive.metastore.uris' = 
'thrift://${extHiveHmsHost}:${extHiveHmsPort}'
+            );
+        """
+        logger.info("catalog " + catalog_name + " created")
+        sql """switch ${catalog_name};"""
+        logger.info("switched to catalog " + catalog_name)
+
+            
+        order_qt_garbled_file """
+        select * from ${catalog_name}.multi_catalog.test_csv_format_error;     
   
+        """ 
+
+        
+    }
+}
+


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org
For additional commands, e-mail: commits-h...@doris.apache.org

Reply via email to