This is an automated email from the ASF dual-hosted git repository.

morningman pushed a commit to branch branch-3.0
in repository https://gitbox.apache.org/repos/asf/doris.git


The following commit(s) were added to refs/heads/branch-3.0 by this push:
     new bbef3ec6903 [enhancement](utf8)import enable_text_validate_utf8 session 
var (#45537) (#46071)
bbef3ec6903 is described below

commit bbef3ec6903f33fdef989c25b2486f186d4f0059
Author: daidai <changyu...@selectdb.com>
AuthorDate: Fri Dec 27 16:56:34 2024 +0800

    [enhancement](utf8)import enable_text_validate_utf8 session var (#45537) 
(#46071)
    
    bp #46071
---
 be/src/util/utf8_check.cpp                         |   7 ++
 be/src/util/utf8_check.h                           |   4 +
 be/src/vec/exec/format/csv/csv_reader.cpp          |   6 +-
 .../scripts/create_preinstalled_scripts/run72.hql  |  31 +++++++
 .../text/utf8_check/utf8_check_fail.csv            |   5 ++
 .../doris/datasource/hive/source/HiveScanNode.java |   4 +
 .../java/org/apache/doris/qe/SessionVariable.java  |  10 +++
 .../ExternalFileTableValuedFunction.java           |   2 +
 gensrc/thrift/PlanNodes.thrift                     |   2 +
 .../external_table_p0/hive/test_utf8_check.out     |  55 ++++++++++++
 .../external_table_p0/hive/test_utf8_check.groovy  | 100 +++++++++++++++++++++
 11 files changed, 223 insertions(+), 3 deletions(-)

diff --git a/be/src/util/utf8_check.cpp b/be/src/util/utf8_check.cpp
index 5355b901420..f90c27e5e91 100644
--- a/be/src/util/utf8_check.cpp
+++ b/be/src/util/utf8_check.cpp
@@ -327,4 +327,11 @@ bool validate_utf8(const char* src, size_t len) {
     return validate_utf8_naive(src, len);
 }
 #endif
+
+bool validate_utf8(const TFileScanRangeParams& params, const char* src, size_t 
len) {
+    if (params.__isset.file_attributes && 
!params.file_attributes.enable_text_validate_utf8) {
+        return true;
+    }
+    return validate_utf8(src, len);
+}
 } // namespace doris
diff --git a/be/src/util/utf8_check.h b/be/src/util/utf8_check.h
index 4214e186b71..7e9b7a2a9de 100644
--- a/be/src/util/utf8_check.h
+++ b/be/src/util/utf8_check.h
@@ -17,6 +17,8 @@
 
 #pragma once
 
+#include <gen_cpp/PlanNodes_types.h>
+
 #include <cstddef>
 
 namespace doris {
@@ -25,4 +27,6 @@ namespace doris {
 bool validate_utf8(const char* src, size_t len);
 // check utf8 use naive c++
 bool validate_utf8_naive(const char* data, size_t len);
+
+bool validate_utf8(const TFileScanRangeParams& params, const char* src, size_t 
len);
 } // namespace doris
diff --git a/be/src/vec/exec/format/csv/csv_reader.cpp 
b/be/src/vec/exec/format/csv/csv_reader.cpp
index b27bb050dc6..397095590dd 100644
--- a/be/src/vec/exec/format/csv/csv_reader.cpp
+++ b/be/src/vec/exec/format/csv/csv_reader.cpp
@@ -713,7 +713,7 @@ Status CsvReader::_fill_empty_line(Block* block, 
std::vector<MutableColumnPtr>&
 }
 
 Status CsvReader::_validate_line(const Slice& line, bool* success) {
-    if (!_is_proto_format && !validate_utf8(line.data, line.size)) {
+    if (!_is_proto_format && !validate_utf8(_params, line.data, line.size)) {
         if (!_is_load) {
             return Status::InternalError<false>("Only support csv data in utf8 
codec");
         } else {
@@ -954,7 +954,7 @@ Status CsvReader::_parse_col_nums(size_t* col_nums) {
         return Status::InternalError<false>(
                 "The first line is empty, can not parse column numbers");
     }
-    if (!validate_utf8(const_cast<char*>(reinterpret_cast<const char*>(ptr)), 
size)) {
+    if (!validate_utf8(_params, const_cast<char*>(reinterpret_cast<const 
char*>(ptr)), size)) {
         return Status::InternalError<false>("Only support csv data in utf8 
codec");
     }
     ptr = _remove_bom(ptr, size);
@@ -971,7 +971,7 @@ Status 
CsvReader::_parse_col_names(std::vector<std::string>* col_names) {
     if (size == 0) {
         return Status::InternalError<false>("The first line is empty, can not 
parse column names");
     }
-    if (!validate_utf8(const_cast<char*>(reinterpret_cast<const char*>(ptr)), 
size)) {
+    if (!validate_utf8(_params, const_cast<char*>(reinterpret_cast<const 
char*>(ptr)), size)) {
         return Status::InternalError<false>("Only support csv data in utf8 
codec");
     }
     ptr = _remove_bom(ptr, size);
diff --git 
a/docker/thirdparties/docker-compose/hive/scripts/create_preinstalled_scripts/run72.hql
 
b/docker/thirdparties/docker-compose/hive/scripts/create_preinstalled_scripts/run72.hql
new file mode 100644
index 00000000000..1ab754b5042
--- /dev/null
+++ 
b/docker/thirdparties/docker-compose/hive/scripts/create_preinstalled_scripts/run72.hql
@@ -0,0 +1,31 @@
+CREATE TABLE invalid_utf8_data (
+    id INT,
+    corrupted_data STRING,
+    string_data1 STRING,
+    string_data2 STRING
+)
+ROW FORMAT DELIMITED
+FIELDS TERMINATED BY ','
+LINES TERMINATED BY '\n'
+location '/user/doris/preinstalled_data/text/utf8_check';
+
+
+CREATE TABLE invalid_utf8_data2 (
+    id INT,
+    corrupted_data STRING,
+    string_data1 STRING,
+    string_data2 STRING
+)
+ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.OpenCSVSerde'
+WITH SERDEPROPERTIES (
+    "separatorChar" = ",",
+    "quoteChar" = "\"",
+    "escapeChar" = "\\"
+)
+location '/user/doris/preinstalled_data/text/utf8_check';
+
+
+
+msck repair table invalid_utf8_data;
+msck repair table invalid_utf8_data2;
+
diff --git 
a/docker/thirdparties/docker-compose/hive/scripts/preinstalled_data/text/utf8_check/utf8_check_fail.csv
 
b/docker/thirdparties/docker-compose/hive/scripts/preinstalled_data/text/utf8_check/utf8_check_fail.csv
new file mode 100644
index 00000000000..391cd493660
--- /dev/null
+++ 
b/docker/thirdparties/docker-compose/hive/scripts/preinstalled_data/text/utf8_check/utf8_check_fail.csv
@@ -0,0 +1,5 @@
+1,�,AAB,helloworld
+2,��,AAB,helloworld
+2,���,AAB,helloworld
+4,����,AAB,helloworld
+5,�����,AAB,helloworld
diff --git 
a/fe/fe-core/src/main/java/org/apache/doris/datasource/hive/source/HiveScanNode.java
 
b/fe/fe-core/src/main/java/org/apache/doris/datasource/hive/source/HiveScanNode.java
index 35b21c368ea..3a8ab722fb6 100644
--- 
a/fe/fe-core/src/main/java/org/apache/doris/datasource/hive/source/HiveScanNode.java
+++ 
b/fe/fe-core/src/main/java/org/apache/doris/datasource/hive/source/HiveScanNode.java
@@ -435,6 +435,8 @@ public class HiveScanNode extends FileQueryScanNode {
             textParams.setNullFormat(HiveProperties.getNullFormat(table));
             fileAttributes.setTextParams(textParams);
             fileAttributes.setHeaderType("");
+            fileAttributes.setEnableTextValidateUtf8(
+                    
ConnectContext.get().getSessionVariable().enableTextValidateUtf8);
         } else if 
(serDeLib.equals("org.apache.hadoop.hive.serde2.OpenCSVSerde")) {
             TFileTextScanRangeParams textParams = new 
TFileTextScanRangeParams();
             // set set properties of OpenCSVSerde
@@ -451,6 +453,8 @@ public class HiveScanNode extends FileQueryScanNode {
             if (textParams.isSetEnclose()) {
                 fileAttributes.setTrimDoubleQuotes(true);
             }
+            fileAttributes.setEnableTextValidateUtf8(
+                    
ConnectContext.get().getSessionVariable().enableTextValidateUtf8);
         } else if (serDeLib.equals("org.apache.hive.hcatalog.data.JsonSerDe")) 
{
             TFileTextScanRangeParams textParams = new 
TFileTextScanRangeParams();
             textParams.setColumnSeparator("\t");
diff --git a/fe/fe-core/src/main/java/org/apache/doris/qe/SessionVariable.java 
b/fe/fe-core/src/main/java/org/apache/doris/qe/SessionVariable.java
index 53b8423e0fe..ab565defe65 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/qe/SessionVariable.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/qe/SessionVariable.java
@@ -690,11 +690,14 @@ public class SessionVariable implements Serializable, 
Writable {
      */
     public static final String ENABLE_AUTO_CREATE_WHEN_OVERWRITE = 
"enable_auto_create_when_overwrite";
 
+
     public static final String 
ENABLE_ADAPTIVE_PIPELINE_TASK_SERIAL_READ_ON_LIMIT =
                                     
"enable_adaptive_pipeline_task_serial_read_on_limit";
     public static final String ADAPTIVE_PIPELINE_TASK_SERIAL_READ_ON_LIMIT =
                                     
"adaptive_pipeline_task_serial_read_on_limit";
 
+    public static final String ENABLE_TEXT_VALIDATE_UTF8 = 
"enable_text_validate_utf8";
+
     /**
      * If set false, user couldn't submit analyze SQL and FE won't allocate 
any related resources.
      */
@@ -2298,6 +2301,13 @@ public class SessionVariable implements Serializable, 
Writable {
     })
     public boolean enableAutoCreateWhenOverwrite = false;
 
+    @VariableMgr.VarAttr(name = ENABLE_TEXT_VALIDATE_UTF8, needForward = true, 
description = {
+            "对于 text 类型的文件读取,是否开启utf8编码检查。非utf8字符会显示成乱码。",
+            "For text type file reading, whether to enable utf8 encoding 
check."
+                    + "non-utf8 characters will be displayed as garbled 
characters."
+    })
+    public boolean enableTextValidateUtf8 = true;
+
     @VariableMgr.VarAttr(name = SKIP_CHECKING_ACID_VERSION_FILE, needForward = 
true, description = {
             "跳过检查 transactional hive 版本文件 '_orc_acid_version.'",
             "Skip checking transactional hive version file 
'_orc_acid_version.'"
diff --git 
a/fe/fe-core/src/main/java/org/apache/doris/tablefunction/ExternalFileTableValuedFunction.java
 
b/fe/fe-core/src/main/java/org/apache/doris/tablefunction/ExternalFileTableValuedFunction.java
index 55d046c2ed9..9031efd0dc2 100644
--- 
a/fe/fe-core/src/main/java/org/apache/doris/tablefunction/ExternalFileTableValuedFunction.java
+++ 
b/fe/fe-core/src/main/java/org/apache/doris/tablefunction/ExternalFileTableValuedFunction.java
@@ -305,6 +305,8 @@ public abstract class ExternalFileTableValuedFunction 
extends TableValuedFunctio
             fileAttributes.setHeaderType(this.headerType);
             fileAttributes.setTrimDoubleQuotes(trimDoubleQuotes);
             fileAttributes.setSkipLines(skipLines);
+            fileAttributes.setEnableTextValidateUtf8(
+                    
ConnectContext.get().getSessionVariable().enableTextValidateUtf8);
         } else if (this.fileFormatType == TFileFormatType.FORMAT_JSON) {
             fileAttributes.setJsonRoot(jsonRoot);
             fileAttributes.setJsonpaths(jsonPaths);
diff --git a/gensrc/thrift/PlanNodes.thrift b/gensrc/thrift/PlanNodes.thrift
index 62e88621aeb..9aaa7076901 100644
--- a/gensrc/thrift/PlanNodes.thrift
+++ b/gensrc/thrift/PlanNodes.thrift
@@ -284,6 +284,8 @@ struct TFileAttributes {
     10: optional bool trim_double_quotes;
     // csv skip line num, only used when csv header_type is not set.
     11: optional i32 skip_lines;
+    //For text type file reading, whether to enable utf8 encoding 
check.(Catalog && TVF)
+    12: optional bool enable_text_validate_utf8 = true;
     // for cloud copy into
     1001: optional bool ignore_csv_redundant_col;
 }
diff --git a/regression-test/data/external_table_p0/hive/test_utf8_check.out 
b/regression-test/data/external_table_p0/hive/test_utf8_check.out
new file mode 100644
index 00000000000..7557e789d49
--- /dev/null
+++ b/regression-test/data/external_table_p0/hive/test_utf8_check.out
@@ -0,0 +1,55 @@
+-- This file is automatically generated. You should know what you did if you 
want to edit this
+-- !1 --
+1      �       AAB     helloworld
+2      ��      AAB     helloworld
+2      ���     AAB     helloworld
+4      ����    AAB     helloworld
+5      �����   AAB     helloworld
+
+-- !2 --
+c1     text    Yes     false   \N      NONE
+c2     text    Yes     false   \N      NONE
+c3     text    Yes     false   \N      NONE
+c4     text    Yes     false   \N      NONE
+
+-- !3 --
+1      �       AAB     helloworld
+2      ��      AAB     helloworld
+2      ���     AAB     helloworld
+4      ����    AAB     helloworld
+5      �����   AAB     helloworld
+
+-- !4 --
+1      �       AAB     helloworld
+2      ��      AAB     helloworld
+2      ���     AAB     helloworld
+4      ����    AAB     helloworld
+5      �����   AAB     helloworld
+
+-- !1 --
+1      �       AAB     helloworld
+2      ��      AAB     helloworld
+2      ���     AAB     helloworld
+4      ����    AAB     helloworld
+5      �����   AAB     helloworld
+
+-- !2 --
+c1     text    Yes     false   \N      NONE
+c2     text    Yes     false   \N      NONE
+c3     text    Yes     false   \N      NONE
+c4     text    Yes     false   \N      NONE
+
+-- !3 --
+1      �       AAB     helloworld
+2      ��      AAB     helloworld
+2      ���     AAB     helloworld
+4      ����    AAB     helloworld
+5      �����   AAB     helloworld
+
+-- !4 --
+1      �       AAB     helloworld
+2      ��      AAB     helloworld
+2      ���     AAB     helloworld
+4      ����    AAB     helloworld
+5      �����   AAB     helloworld
+
diff --git 
a/regression-test/suites/external_table_p0/hive/test_utf8_check.groovy 
b/regression-test/suites/external_table_p0/hive/test_utf8_check.groovy
new file mode 100644
index 00000000000..aa26fdede73
--- /dev/null
+++ b/regression-test/suites/external_table_p0/hive/test_utf8_check.groovy
@@ -0,0 +1,100 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+
+suite("test_utf8_check","p0,external,tvf,hive,external_docker,external_docker_hive")
 {
+    String enabled = context.config.otherConfigs.get("enableHiveTest")
+    if (enabled == null || !enabled.equalsIgnoreCase("true")) {
+        logger.info("diable Hive test.")
+        return;
+    }
+
+    for (String hivePrefix : ["hive2","hive3"]) {
+    
+        String hms_port = context.config.otherConfigs.get(hivePrefix + 
"HmsPort")
+        String catalog_name = "${hivePrefix}_test_utf8_check"
+        String externalEnvIp = context.config.otherConfigs.get("externalEnvIp")
+        def hdfsUserName = "doris"
+        String hdfs_port = context.config.otherConfigs.get(hivePrefix + 
"HdfsPort")
+        def defaultFS = "hdfs://${externalEnvIp}:${hdfs_port}"
+
+        sql """drop catalog if exists ${catalog_name}"""
+        sql """create catalog if not exists ${catalog_name} properties (
+            "type"="hms",
+            'hive.metastore.uris' = 'thrift://${externalEnvIp}:${hms_port}'
+        );"""
+        sql """use `${catalog_name}`.`default`"""
+        
+
+        sql """ set enable_text_validate_utf8 = true; """     
+
+        test {
+            sql """ select * from invalid_utf8_data """ 
+            exception """Only support csv data in utf8 codec"""
+        }
+        
+        
+        test {
+            sql """ select * from invalid_utf8_data2; """ 
+            exception """Only support csv data in utf8 codec"""
+        }
+
+
+        def uri = "${defaultFS}" + 
"/user/doris/preinstalled_data/text/utf8_check/utf8_check_fail.csv"
+
+        
+        test {
+            sql """ desc function  HDFS(
+                "uri" = "${uri}",
+                "hadoop.username" = "${hdfsUserName}",            
+                "format" = "csv",
+                "column_separator"=",")"""    
+            exception """Only support csv data in utf8 codec"""
+        }
+
+        test {
+            sql """select * from HDFS(
+                "uri" = "${uri}",
+                "hadoop.username" = "${hdfsUserName}",            
+                "format" = "csv",
+                "column_separator"=",")"""    
+            exception """Only support csv data in utf8 codec"""
+        }
+    
+
+        sql """ set enable_text_validate_utf8 = false; """     
+
+        qt_1 """select * from invalid_utf8_data order by id """ 
+    
+        qt_2 """ desc function  HDFS(
+                "uri" = "${uri}",
+                "hadoop.username" = "${hdfsUserName}",            
+                "format" = "csv",
+                "column_separator"=",")"""    
+
+
+        qt_3 """select * from   HDFS(
+                "uri" = "${uri}",
+                "hadoop.username" = "${hdfsUserName}",            
+                "format" = "csv",
+                "column_separator"=",") order by c1"""    
+        qt_4 """select * from invalid_utf8_data2 order by id """ 
+    
+    
+    }
+
+}
\ No newline at end of file


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org
For additional commands, e-mail: commits-h...@doris.apache.org

Reply via email to