This is an automated email from the ASF dual-hosted git repository.

morningman pushed a commit to branch branch-2.1
in repository https://gitbox.apache.org/repos/asf/doris.git


The following commit(s) were added to refs/heads/branch-2.1 by this push:
     new a380f5d2228 [enhancement](utf8) import enable_text_validate_utf8 session var (#45537) (#46070)
a380f5d2228 is described below

commit a380f5d2228517492b863e6998c05093842b599f
Author: daidai <changyu...@selectdb.com>
AuthorDate: Sat Dec 28 10:05:03 2024 +0800

    [enhancement](utf8) import enable_text_validate_utf8 session var (#45537) (#46070)
    
    bp #45537
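    
    A minimal usage sketch (session variable, table name, and error message are the
    ones exercised by this patch's regression test):
    
        set enable_text_validate_utf8 = true;          -- default: invalid UTF-8 in text/CSV input is rejected
        select * from invalid_utf8_data;               -- fails: "Only support csv data in utf8 codec"
        set enable_text_validate_utf8 = false;         -- opt out of the check
        select * from invalid_utf8_data order by id;   -- invalid bytes are read and shown as garbled characters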
---
 be/src/util/utf8_check.cpp                         |   7 ++
 be/src/util/utf8_check.h                           |   4 +
 be/src/vec/exec/format/csv/csv_reader.cpp          |   6 +-
 .../scripts/create_preinstalled_scripts/run72.hql  |  31 +++++++
 .../text/utf8_check/utf8_check_fail.csv            |   5 ++
 .../doris/datasource/hive/source/HiveScanNode.java |   2 +
 .../java/org/apache/doris/qe/SessionVariable.java  |   9 ++
 .../ExternalFileTableValuedFunction.java           |   2 +
 gensrc/thrift/PlanNodes.thrift                     |   2 +
 .../external_table_p0/hive/test_utf8_check.out     |  55 ++++++++++++
 .../external_table_p0/hive/test_utf8_check.groovy  | 100 +++++++++++++++++++++
 11 files changed, 220 insertions(+), 3 deletions(-)

diff --git a/be/src/util/utf8_check.cpp b/be/src/util/utf8_check.cpp
index 5355b901420..f90c27e5e91 100644
--- a/be/src/util/utf8_check.cpp
+++ b/be/src/util/utf8_check.cpp
@@ -327,4 +327,11 @@ bool validate_utf8(const char* src, size_t len) {
     return validate_utf8_naive(src, len);
 }
 #endif
+
+bool validate_utf8(const TFileScanRangeParams& params, const char* src, size_t len) {
+    if (params.__isset.file_attributes && !params.file_attributes.enable_text_validate_utf8) {
+        return true;
+    }
+    return validate_utf8(src, len);
+}
 } // namespace doris
diff --git a/be/src/util/utf8_check.h b/be/src/util/utf8_check.h
index 4214e186b71..7e9b7a2a9de 100644
--- a/be/src/util/utf8_check.h
+++ b/be/src/util/utf8_check.h
@@ -17,6 +17,8 @@
 
 #pragma once
 
+#include <gen_cpp/PlanNodes_types.h>
+
 #include <cstddef>
 
 namespace doris {
@@ -25,4 +27,6 @@ namespace doris {
 bool validate_utf8(const char* src, size_t len);
 // check utf8 use naive c++
 bool validate_utf8_naive(const char* data, size_t len);
+
+bool validate_utf8(const TFileScanRangeParams& params, const char* src, size_t len);
 } // namespace doris
diff --git a/be/src/vec/exec/format/csv/csv_reader.cpp b/be/src/vec/exec/format/csv/csv_reader.cpp
index 1fc3bbad294..77a5b65d512 100644
--- a/be/src/vec/exec/format/csv/csv_reader.cpp
+++ b/be/src/vec/exec/format/csv/csv_reader.cpp
@@ -715,7 +715,7 @@ Status CsvReader::_fill_empty_line(Block* block, std::vector<MutableColumnPtr>&
 }
 
 Status CsvReader::_validate_line(const Slice& line, bool* success) {
-    if (!_is_proto_format && !validate_utf8(line.data, line.size)) {
+    if (!_is_proto_format && !validate_utf8(_params, line.data, line.size)) {
         if (!_is_load) {
             return Status::InternalError<false>("Only support csv data in utf8 codec");
         } else {
@@ -951,7 +951,7 @@ Status CsvReader::_parse_col_nums(size_t* col_nums) {
         return Status::InternalError<false>(
                 "The first line is empty, can not parse column numbers");
     }
-    if (!validate_utf8(const_cast<char*>(reinterpret_cast<const char*>(ptr)), size)) {
+    if (!validate_utf8(_params, const_cast<char*>(reinterpret_cast<const char*>(ptr)), size)) {
         return Status::InternalError<false>("Only support csv data in utf8 codec");
     }
     ptr = _remove_bom(ptr, size);
@@ -968,7 +968,7 @@ Status CsvReader::_parse_col_names(std::vector<std::string>* col_names) {
     if (size == 0) {
         return Status::InternalError<false>("The first line is empty, can not parse column names");
     }
-    if (!validate_utf8(const_cast<char*>(reinterpret_cast<const char*>(ptr)), size)) {
+    if (!validate_utf8(_params, const_cast<char*>(reinterpret_cast<const char*>(ptr)), size)) {
         return Status::InternalError<false>("Only support csv data in utf8 codec");
     }
     ptr = _remove_bom(ptr, size);
diff --git a/docker/thirdparties/docker-compose/hive/scripts/create_preinstalled_scripts/run72.hql b/docker/thirdparties/docker-compose/hive/scripts/create_preinstalled_scripts/run72.hql
new file mode 100644
index 00000000000..1ab754b5042
--- /dev/null
+++ b/docker/thirdparties/docker-compose/hive/scripts/create_preinstalled_scripts/run72.hql
@@ -0,0 +1,31 @@
+CREATE TABLE invalid_utf8_data (
+    id INT,
+    corrupted_data STRING,
+    string_data1 STRING,
+    string_data2 STRING
+)
+ROW FORMAT DELIMITED
+FIELDS TERMINATED BY ','
+LINES TERMINATED BY '\n'
+location '/user/doris/preinstalled_data/text/utf8_check';
+
+
+CREATE TABLE invalid_utf8_data2 (
+    id INT,
+    corrupted_data STRING,
+    string_data1 STRING,
+    string_data2 STRING
+)
+ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.OpenCSVSerde'
+WITH SERDEPROPERTIES (
+    "separatorChar" = ",",
+    "quoteChar" = "\"",
+    "escapeChar" = "\\"
+)
+location '/user/doris/preinstalled_data/text/utf8_check';
+
+
+
+msck repair table invalid_utf8_data;
+msck repair table invalid_utf8_data2;
+
diff --git a/docker/thirdparties/docker-compose/hive/scripts/preinstalled_data/text/utf8_check/utf8_check_fail.csv b/docker/thirdparties/docker-compose/hive/scripts/preinstalled_data/text/utf8_check/utf8_check_fail.csv
new file mode 100644
index 00000000000..391cd493660
--- /dev/null
+++ b/docker/thirdparties/docker-compose/hive/scripts/preinstalled_data/text/utf8_check/utf8_check_fail.csv
@@ -0,0 +1,5 @@
+1,�,AAB,helloworld
+2,��,AAB,helloworld
+2,���,AAB,helloworld
+4,����,AAB,helloworld
+5,�����,AAB,helloworld
diff --git a/fe/fe-core/src/main/java/org/apache/doris/datasource/hive/source/HiveScanNode.java b/fe/fe-core/src/main/java/org/apache/doris/datasource/hive/source/HiveScanNode.java
index 02906494b03..dcabd11358c 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/datasource/hive/source/HiveScanNode.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/datasource/hive/source/HiveScanNode.java
@@ -452,6 +452,8 @@ public class HiveScanNode extends FileQueryScanNode {
         TFileAttributes fileAttributes = new TFileAttributes();
         fileAttributes.setTextParams(textParams);
         fileAttributes.setHeaderType("");
+        fileAttributes.setEnableTextValidateUtf8(
+                ConnectContext.get().getSessionVariable().enableTextValidateUtf8);
         if (textParams.isSet(TFileTextScanRangeParams._Fields.ENCLOSE)) {
             fileAttributes.setTrimDoubleQuotes(true);
         }
diff --git a/fe/fe-core/src/main/java/org/apache/doris/qe/SessionVariable.java b/fe/fe-core/src/main/java/org/apache/doris/qe/SessionVariable.java
index e9dd9ec5822..f996b538257 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/qe/SessionVariable.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/qe/SessionVariable.java
@@ -665,6 +665,8 @@ public class SessionVariable implements Serializable, Writable {
      */
     public static final String ENABLE_AUTO_CREATE_WHEN_OVERWRITE = "enable_auto_create_when_overwrite";
 
+    public static final String ENABLE_TEXT_VALIDATE_UTF8 = "enable_text_validate_utf8";
+
     /**
      * If set false, user couldn't submit analyze SQL and FE won't allocate any related resources.
      */
@@ -2219,6 +2221,13 @@ public class SessionVariable implements Serializable, Writable {
     })
     public boolean enableAutoCreateWhenOverwrite = false;
 
+    @VariableMgr.VarAttr(name = ENABLE_TEXT_VALIDATE_UTF8, needForward = true, description = {
+            "对于 text 类型的文件读取,是否开启utf8编码检查。非utf8字符会显示成乱码。",
+            "For text type file reading, whether to enable utf8 encoding check."
+                    + "non-utf8 characters will be displayed as garbled characters."
+    })
+    public boolean enableTextValidateUtf8 = true;
+
     @VariableMgr.VarAttr(name = SKIP_CHECKING_ACID_VERSION_FILE, needForward = true, description = {
             "跳过检查 transactional hive 版本文件 '_orc_acid_version.'",
             "Skip checking transactional hive version file '_orc_acid_version.'"
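
The new variable can be inspected and toggled per session with the standard SHOW
VARIABLES / SET statements (a usage sketch, not code from this patch):

    show variables like 'enable_text_validate_utf8';
    set enable_text_validate_utf8 = false;
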
diff --git a/fe/fe-core/src/main/java/org/apache/doris/tablefunction/ExternalFileTableValuedFunction.java b/fe/fe-core/src/main/java/org/apache/doris/tablefunction/ExternalFileTableValuedFunction.java
index 1f65921832b..cb1a2d89c5d 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/tablefunction/ExternalFileTableValuedFunction.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/tablefunction/ExternalFileTableValuedFunction.java
@@ -304,6 +304,8 @@ public abstract class ExternalFileTableValuedFunction extends TableValuedFunctio
             fileAttributes.setHeaderType(this.headerType);
             fileAttributes.setTrimDoubleQuotes(trimDoubleQuotes);
             fileAttributes.setSkipLines(skipLines);
+            fileAttributes.setEnableTextValidateUtf8(
+                    ConnectContext.get().getSessionVariable().enableTextValidateUtf8);
         } else if (this.fileFormatType == TFileFormatType.FORMAT_JSON) {
             fileAttributes.setJsonRoot(jsonRoot);
             fileAttributes.setJsonpaths(jsonPaths);
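
The same flag is copied into TFileAttributes for table-valued functions, so
disabling the check also applies to TVF reads; a sketch mirroring the regression
test (host, port, and username below are illustrative placeholders):

    set enable_text_validate_utf8 = false;
    select * from HDFS(
        "uri" = "hdfs://host:port/user/doris/preinstalled_data/text/utf8_check/utf8_check_fail.csv",
        "hadoop.username" = "doris",
        "format" = "csv",
        "column_separator" = ",");
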
diff --git a/gensrc/thrift/PlanNodes.thrift b/gensrc/thrift/PlanNodes.thrift
index 1b873787765..7ccb12b3331 100644
--- a/gensrc/thrift/PlanNodes.thrift
+++ b/gensrc/thrift/PlanNodes.thrift
@@ -284,6 +284,8 @@ struct TFileAttributes {
     10: optional bool trim_double_quotes;
     // csv skip line num, only used when csv header_type is not set.
     11: optional i32 skip_lines;
+    //For text type file reading, whether to enable utf8 encoding check.(Catalog && TVF)
+    12: optional bool enable_text_validate_utf8 = true;
 }
 
 struct TIcebergDeleteFileDesc {
diff --git a/regression-test/data/external_table_p0/hive/test_utf8_check.out b/regression-test/data/external_table_p0/hive/test_utf8_check.out
new file mode 100644
index 00000000000..7557e789d49
--- /dev/null
+++ b/regression-test/data/external_table_p0/hive/test_utf8_check.out
@@ -0,0 +1,55 @@
+-- This file is automatically generated. You should know what you did if you want to edit this
+-- !1 --
+1      �       AAB     helloworld
+2      ��      AAB     helloworld
+2      ���     AAB     helloworld
+4      ����    AAB     helloworld
+5      �����   AAB     helloworld
+
+-- !2 --
+c1     text    Yes     false   \N      NONE
+c2     text    Yes     false   \N      NONE
+c3     text    Yes     false   \N      NONE
+c4     text    Yes     false   \N      NONE
+
+-- !3 --
+1      �       AAB     helloworld
+2      ��      AAB     helloworld
+2      ���     AAB     helloworld
+4      ����    AAB     helloworld
+5      �����   AAB     helloworld
+
+-- !4 --
+1      �       AAB     helloworld
+2      ��      AAB     helloworld
+2      ���     AAB     helloworld
+4      ����    AAB     helloworld
+5      �����   AAB     helloworld
+
+-- !1 --
+1      �       AAB     helloworld
+2      ��      AAB     helloworld
+2      ���     AAB     helloworld
+4      ����    AAB     helloworld
+5      �����   AAB     helloworld
+
+-- !2 --
+c1     text    Yes     false   \N      NONE
+c2     text    Yes     false   \N      NONE
+c3     text    Yes     false   \N      NONE
+c4     text    Yes     false   \N      NONE
+
+-- !3 --
+1      �       AAB     helloworld
+2      ��      AAB     helloworld
+2      ���     AAB     helloworld
+4      ����    AAB     helloworld
+5      �����   AAB     helloworld
+
+-- !4 --
+1      �       AAB     helloworld
+2      ��      AAB     helloworld
+2      ���     AAB     helloworld
+4      ����    AAB     helloworld
+5      �����   AAB     helloworld
+
diff --git a/regression-test/suites/external_table_p0/hive/test_utf8_check.groovy b/regression-test/suites/external_table_p0/hive/test_utf8_check.groovy
new file mode 100644
index 00000000000..aa26fdede73
--- /dev/null
+++ b/regression-test/suites/external_table_p0/hive/test_utf8_check.groovy
@@ -0,0 +1,100 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+
+suite("test_utf8_check","p0,external,tvf,hive,external_docker,external_docker_hive") {
+    String enabled = context.config.otherConfigs.get("enableHiveTest")
+    if (enabled == null || !enabled.equalsIgnoreCase("true")) {
+        logger.info("disable Hive test.")
+        return;
+    }
+
+    for (String hivePrefix : ["hive2","hive3"]) {
+    
+        String hms_port = context.config.otherConfigs.get(hivePrefix + "HmsPort")
+        String catalog_name = "${hivePrefix}_test_utf8_check"
+        String externalEnvIp = context.config.otherConfigs.get("externalEnvIp")
+        def hdfsUserName = "doris"
+        String hdfs_port = context.config.otherConfigs.get(hivePrefix + "HdfsPort")
+        def defaultFS = "hdfs://${externalEnvIp}:${hdfs_port}"
+
+        sql """drop catalog if exists ${catalog_name}"""
+        sql """create catalog if not exists ${catalog_name} properties (
+            "type"="hms",
+            'hive.metastore.uris' = 'thrift://${externalEnvIp}:${hms_port}'
+        );"""
+        sql """use `${catalog_name}`.`default`"""
+        
+
+        sql """ set enable_text_validate_utf8 = true; """     
+
+        test {
+            sql """ select * from invalid_utf8_data """ 
+            exception """Only support csv data in utf8 codec"""
+        }
+        
+        
+        test {
+            sql """ select * from invalid_utf8_data2; """ 
+            exception """Only support csv data in utf8 codec"""
+        }
+
+
+        def uri = "${defaultFS}" + "/user/doris/preinstalled_data/text/utf8_check/utf8_check_fail.csv"
+
+        
+        test {
+            sql """ desc function  HDFS(
+                "uri" = "${uri}",
+                "hadoop.username" = "${hdfsUserName}",            
+                "format" = "csv",
+                "column_separator"=",")"""    
+            exception """Only support csv data in utf8 codec"""
+        }
+
+        test {
+            sql """select * from HDFS(
+                "uri" = "${uri}",
+                "hadoop.username" = "${hdfsUserName}",            
+                "format" = "csv",
+                "column_separator"=",")"""    
+            exception """Only support csv data in utf8 codec"""
+        }
+    
+
+        sql """ set enable_text_validate_utf8 = false; """     
+
+        qt_1 """select * from invalid_utf8_data order by id """ 
+    
+        qt_2 """ desc function  HDFS(
+                "uri" = "${uri}",
+                "hadoop.username" = "${hdfsUserName}",            
+                "format" = "csv",
+                "column_separator"=",")"""    
+
+
+        qt_3 """select * from   HDFS(
+                "uri" = "${uri}",
+                "hadoop.username" = "${hdfsUserName}",            
+                "format" = "csv",
+                "column_separator"=",") order by c1"""    
+        qt_4 """select * from invalid_utf8_data2 order by id """ 
+    
+    
+    }
+
+}
\ No newline at end of file

