This is an automated email from the ASF dual-hosted git repository. morningman pushed a commit to branch branch-2.1-lakehouse in repository https://gitbox.apache.org/repos/asf/doris.git
commit f04ba345104d91183469b18fdccd53f9e509b5d2 Author: morningman <morning...@163.com> AuthorDate: Mon Feb 17 15:31:55 2025 +0800 Revert "[enchement](utf8)import enable_text_validate_utf8 session var (#45537) (#46070)" This reverts commit a380f5d2228517492b863e6998c05093842b599f. --- be/src/util/utf8_check.cpp | 7 -- be/src/util/utf8_check.h | 4 - be/src/vec/exec/format/csv/csv_reader.cpp | 6 +- .../scripts/create_preinstalled_scripts/run72.hql | 31 ------- .../text/utf8_check/utf8_check_fail.csv | 5 -- .../doris/datasource/hive/source/HiveScanNode.java | 2 - .../java/org/apache/doris/qe/SessionVariable.java | 9 -- .../ExternalFileTableValuedFunction.java | 2 - gensrc/thrift/PlanNodes.thrift | 2 - .../external_table_p0/hive/test_utf8_check.out | Bin 1193 -> 0 bytes .../external_table_p0/hive/test_utf8_check.groovy | 100 --------------------- 11 files changed, 3 insertions(+), 165 deletions(-) diff --git a/be/src/util/utf8_check.cpp b/be/src/util/utf8_check.cpp index f90c27e5e91..5355b901420 100644 --- a/be/src/util/utf8_check.cpp +++ b/be/src/util/utf8_check.cpp @@ -327,11 +327,4 @@ bool validate_utf8(const char* src, size_t len) { return validate_utf8_naive(src, len); } #endif - -bool validate_utf8(const TFileScanRangeParams& params, const char* src, size_t len) { - if (params.__isset.file_attributes && !params.file_attributes.enable_text_validate_utf8) { - return true; - } - return validate_utf8(src, len); -} } // namespace doris diff --git a/be/src/util/utf8_check.h b/be/src/util/utf8_check.h index 7e9b7a2a9de..4214e186b71 100644 --- a/be/src/util/utf8_check.h +++ b/be/src/util/utf8_check.h @@ -17,8 +17,6 @@ #pragma once -#include <gen_cpp/PlanNodes_types.h> - #include <cstddef> namespace doris { @@ -27,6 +25,4 @@ namespace doris { bool validate_utf8(const char* src, size_t len); // check utf8 use naive c++ bool validate_utf8_naive(const char* data, size_t len); - -bool validate_utf8(const TFileScanRangeParams& params, const char* src, size_t len); } // namespace doris diff --git a/be/src/vec/exec/format/csv/csv_reader.cpp b/be/src/vec/exec/format/csv/csv_reader.cpp index 77a5b65d512..1fc3bbad294 100644 --- a/be/src/vec/exec/format/csv/csv_reader.cpp +++ b/be/src/vec/exec/format/csv/csv_reader.cpp @@ -715,7 +715,7 @@ Status CsvReader::_fill_empty_line(Block* block, std::vector<MutableColumnPtr>& } Status CsvReader::_validate_line(const Slice& line, bool* success) { - if (!_is_proto_format && !validate_utf8(_params, line.data, line.size)) { + if (!_is_proto_format && !validate_utf8(line.data, line.size)) { if (!_is_load) { return Status::InternalError<false>("Only support csv data in utf8 codec"); } else { @@ -951,7 +951,7 @@ Status CsvReader::_parse_col_nums(size_t* col_nums) { return Status::InternalError<false>( "The first line is empty, can not parse column numbers"); } - if (!validate_utf8(_params, const_cast<char*>(reinterpret_cast<const char*>(ptr)), size)) { + if (!validate_utf8(const_cast<char*>(reinterpret_cast<const char*>(ptr)), size)) { return Status::InternalError<false>("Only support csv data in utf8 codec"); } ptr = _remove_bom(ptr, size); @@ -968,7 +968,7 @@ Status CsvReader::_parse_col_names(std::vector<std::string>* col_names) { if (size == 0) { return Status::InternalError<false>("The first line is empty, can not parse column names"); } - if (!validate_utf8(_params, const_cast<char*>(reinterpret_cast<const char*>(ptr)), size)) { + if (!validate_utf8(const_cast<char*>(reinterpret_cast<const char*>(ptr)), size)) { return Status::InternalError<false>("Only support csv data in utf8 codec"); } ptr = _remove_bom(ptr, size); diff --git a/docker/thirdparties/docker-compose/hive/scripts/create_preinstalled_scripts/run72.hql b/docker/thirdparties/docker-compose/hive/scripts/create_preinstalled_scripts/run72.hql deleted file mode 100644 index 1ab754b5042..00000000000 --- a/docker/thirdparties/docker-compose/hive/scripts/create_preinstalled_scripts/run72.hql +++ /dev/null @@ -1,31 +0,0 @@ -CREATE TABLE invalid_utf8_data ( - id INT, - corrupted_data STRING, - string_data1 STRING, - string_data2 STRING -) -ROW FORMAT DELIMITED -FIELDS TERMINATED BY ',' -LINES TERMINATED BY '\n' -location '/user/doris/preinstalled_data/text/utf8_check'; - - -CREATE TABLE invalid_utf8_data2 ( - id INT, - corrupted_data STRING, - string_data1 STRING, - string_data2 STRING -) -ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.OpenCSVSerde' -WITH SERDEPROPERTIES ( - "separatorChar" = ",", - "quoteChar" = "\"", - "escapeChar" = "\\" -) -location '/user/doris/preinstalled_data/text/utf8_check'; - - - -msck repair table invalid_utf8_data; -msck repair table invalid_utf8_data2; - diff --git a/docker/thirdparties/docker-compose/hive/scripts/preinstalled_data/text/utf8_check/utf8_check_fail.csv b/docker/thirdparties/docker-compose/hive/scripts/preinstalled_data/text/utf8_check/utf8_check_fail.csv deleted file mode 100644 index 391cd493660..00000000000 --- a/docker/thirdparties/docker-compose/hive/scripts/preinstalled_data/text/utf8_check/utf8_check_fail.csv +++ /dev/null @@ -1,5 +0,0 @@ -1,�,AAB,helloworld -2,��,AAB,helloworld -2,���,AAB,helloworld -4,����,AAB,helloworld -5,�����,AAB,helloworld diff --git a/fe/fe-core/src/main/java/org/apache/doris/datasource/hive/source/HiveScanNode.java b/fe/fe-core/src/main/java/org/apache/doris/datasource/hive/source/HiveScanNode.java index 8f965ddf022..7b0b3fd1c19 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/datasource/hive/source/HiveScanNode.java +++ b/fe/fe-core/src/main/java/org/apache/doris/datasource/hive/source/HiveScanNode.java @@ -452,8 +452,6 @@ public class HiveScanNode extends FileQueryScanNode { TFileAttributes fileAttributes = new TFileAttributes(); fileAttributes.setTextParams(textParams); fileAttributes.setHeaderType(""); - fileAttributes.setEnableTextValidateUtf8( - ConnectContext.get().getSessionVariable().enableTextValidateUtf8); if (textParams.isSet(TFileTextScanRangeParams._Fields.ENCLOSE)) { fileAttributes.setTrimDoubleQuotes(true); } diff --git a/fe/fe-core/src/main/java/org/apache/doris/qe/SessionVariable.java b/fe/fe-core/src/main/java/org/apache/doris/qe/SessionVariable.java index f68fd1423c3..5e0c3b345e3 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/qe/SessionVariable.java +++ b/fe/fe-core/src/main/java/org/apache/doris/qe/SessionVariable.java @@ -667,8 +667,6 @@ public class SessionVariable implements Serializable, Writable { */ public static final String ENABLE_AUTO_CREATE_WHEN_OVERWRITE = "enable_auto_create_when_overwrite"; - public static final String ENABLE_TEXT_VALIDATE_UTF8 = "enable_text_validate_utf8"; - /** * If set false, user couldn't submit analyze SQL and FE won't allocate any related resources. */ @@ -2230,13 +2228,6 @@ public class SessionVariable implements Serializable, Writable { }) public boolean enableAutoCreateWhenOverwrite = false; - @VariableMgr.VarAttr(name = ENABLE_TEXT_VALIDATE_UTF8, needForward = true, description = { - "对于 text 类型的文件读取,是否开启utf8编码检查。非utf8字符会显示成乱码。", - "For text type file reading, whether to enable utf8 encoding check." - + "non-utf8 characters will be displayed as garbled characters." - }) - public boolean enableTextValidateUtf8 = true; - @VariableMgr.VarAttr(name = SKIP_CHECKING_ACID_VERSION_FILE, needForward = true, description = { "跳过检查 transactional hive 版本文件 '_orc_acid_version.'", "Skip checking transactional hive version file '_orc_acid_version.'" diff --git a/fe/fe-core/src/main/java/org/apache/doris/tablefunction/ExternalFileTableValuedFunction.java b/fe/fe-core/src/main/java/org/apache/doris/tablefunction/ExternalFileTableValuedFunction.java index cb1a2d89c5d..1f65921832b 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/tablefunction/ExternalFileTableValuedFunction.java +++ b/fe/fe-core/src/main/java/org/apache/doris/tablefunction/ExternalFileTableValuedFunction.java @@ -304,8 +304,6 @@ public abstract class ExternalFileTableValuedFunction extends TableValuedFunctio fileAttributes.setHeaderType(this.headerType); fileAttributes.setTrimDoubleQuotes(trimDoubleQuotes); fileAttributes.setSkipLines(skipLines); - fileAttributes.setEnableTextValidateUtf8( - ConnectContext.get().getSessionVariable().enableTextValidateUtf8); } else if (this.fileFormatType == TFileFormatType.FORMAT_JSON) { fileAttributes.setJsonRoot(jsonRoot); fileAttributes.setJsonpaths(jsonPaths); diff --git a/gensrc/thrift/PlanNodes.thrift b/gensrc/thrift/PlanNodes.thrift index 7ccb12b3331..1b873787765 100644 --- a/gensrc/thrift/PlanNodes.thrift +++ b/gensrc/thrift/PlanNodes.thrift @@ -284,8 +284,6 @@ struct TFileAttributes { 10: optional bool trim_double_quotes; // csv skip line num, only used when csv header_type is not set. 11: optional i32 skip_lines; - //For text type file reading, whether to enable utf8 encoding check.(Catalog && TVF) - 12: optional bool enable_text_validate_utf8 = true; } struct TIcebergDeleteFileDesc { diff --git a/regression-test/data/external_table_p0/hive/test_utf8_check.out b/regression-test/data/external_table_p0/hive/test_utf8_check.out deleted file mode 100644 index 7557e789d49..00000000000 Binary files a/regression-test/data/external_table_p0/hive/test_utf8_check.out and /dev/null differ diff --git a/regression-test/suites/external_table_p0/hive/test_utf8_check.groovy b/regression-test/suites/external_table_p0/hive/test_utf8_check.groovy deleted file mode 100644 index aa26fdede73..00000000000 --- a/regression-test/suites/external_table_p0/hive/test_utf8_check.groovy +++ /dev/null @@ -1,100 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - - -suite("test_utf8_check","p0,external,tvf,hive,external_docker,external_docker_hive") { - String enabled = context.config.otherConfigs.get("enableHiveTest") - if (enabled == null || !enabled.equalsIgnoreCase("true")) { - logger.info("diable Hive test.") - return; - } - - for (String hivePrefix : ["hive2","hive3"]) { - - String hms_port = context.config.otherConfigs.get(hivePrefix + "HmsPort") - String catalog_name = "${hivePrefix}_test_utf8_check" - String externalEnvIp = context.config.otherConfigs.get("externalEnvIp") - def hdfsUserName = "doris" - String hdfs_port = context.config.otherConfigs.get(hivePrefix + "HdfsPort") - def defaultFS = "hdfs://${externalEnvIp}:${hdfs_port}" - - sql """drop catalog if exists ${catalog_name}""" - sql """create catalog if not exists ${catalog_name} properties ( - "type"="hms", - 'hive.metastore.uris' = 'thrift://${externalEnvIp}:${hms_port}' - );""" - sql """use `${catalog_name}`.`default`""" - - - sql """ set enable_text_validate_utf8 = true; """ - - test { - sql """ select * from invalid_utf8_data """ - exception """Only support csv data in utf8 codec""" - } - - - test { - sql """ select * from invalid_utf8_data2; """ - exception """Only support csv data in utf8 codec""" - } - - - def uri = "${defaultFS}" + "/user/doris/preinstalled_data/text/utf8_check/utf8_check_fail.csv" - - - test { - sql """ desc function HDFS( - "uri" = "${uri}", - "hadoop.username" = "${hdfsUserName}", - "format" = "csv", - "column_separator"=",")""" - exception """Only support csv data in utf8 codec""" - } - - test { - sql """select * from HDFS( - "uri" = "${uri}", - "hadoop.username" = "${hdfsUserName}", - "format" = "csv", - "column_separator"=",")""" - exception """Only support csv data in utf8 codec""" - } - - - sql """ set enable_text_validate_utf8 = false; """ - - qt_1 """select * from invalid_utf8_data order by id """ - - qt_2 """ desc function HDFS( - "uri" = "${uri}", - "hadoop.username" = "${hdfsUserName}", - "format" = "csv", - "column_separator"=",")""" - - - qt_3 """select * from HDFS( - "uri" = "${uri}", - "hadoop.username" = "${hdfsUserName}", - "format" = "csv", - "column_separator"=",") order by c1""" - qt_4 """select * from invalid_utf8_data2 order by id """ - - - } - -} \ No newline at end of file --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org For additional commands, e-mail: commits-h...@doris.apache.org