This is an automated email from the ASF dual-hosted git repository.

ashingau pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/master by this push:
     new f2658dc7bd [Feature](multi-catalog) Truncate char or varchar columns if size is smaller than file columns or not found in the file column schema. (#22318)
f2658dc7bd is described below

commit f2658dc7bdae515aa5e1c011667fd4fdae6a702c
Author: Qi Chen <kaka11.c...@gmail.com>
AuthorDate: Thu Aug 10 14:37:20 2023 +0800

    [Feature](multi-catalog) Truncate char or varchar columns if size is smaller than file columns or not found in the file column schema. (#22318)

    Truncate char or varchar columns if size is smaller than file columns or
    not found in the file column schema by session var
    `truncate_char_or_varchar_columns`.
---
 be/src/vec/exec/format/orc/vorc_reader.cpp         |  4 +-
 be/src/vec/exec/format/table/table_format_reader.h |  5 ++
 be/src/vec/exec/scan/vfile_scanner.cpp             | 73 +++++++++++++++++++
 be/src/vec/exec/scan/vfile_scanner.h               |  7 ++
 docs/en/docs/advanced/variables.md                 |  6 ++
 docs/en/docs/lakehouse/multi-catalog/hive.md       |  6 ++
 docs/zh-CN/docs/advanced/variables.md              |  6 ++
 docs/zh-CN/docs/lakehouse/multi-catalog/hive.md    |  6 ++
 .../java/org/apache/doris/catalog/ScalarType.java  |  4 +-
 .../doris/catalog/HiveMetaStoreClientHelper.java   | 10 +--
 .../java/org/apache/doris/qe/SessionVariable.java  | 25 +++++++
 gensrc/thrift/PaloInternalService.thrift           |  2 +
 .../hive/test_truncate_char_or_varchar_columns.out | 85 ++++++++++++++++++++++
 .../test_truncate_char_or_varchar_columns.groovy   | 78 ++++++++++++++++++++
 14 files changed, 308 insertions(+), 9 deletions(-)

diff --git a/be/src/vec/exec/format/orc/vorc_reader.cpp b/be/src/vec/exec/format/orc/vorc_reader.cpp
index 964db48027..2e053a298a 100644
--- a/be/src/vec/exec/format/orc/vorc_reader.cpp
+++ b/be/src/vec/exec/format/orc/vorc_reader.cpp
@@ -929,9 +929,9 @@ TypeDescriptor OrcReader::_convert_to_doris_type(const orc::Type* orc_type) {
     case orc::TypeKind::DATE:
         return TypeDescriptor(PrimitiveType::TYPE_DATEV2);
     case orc::TypeKind::VARCHAR:
-        return TypeDescriptor(PrimitiveType::TYPE_VARCHAR);
+        return TypeDescriptor::create_varchar_type(orc_type->getMaximumLength());
     case orc::TypeKind::CHAR:
-        return TypeDescriptor(PrimitiveType::TYPE_CHAR);
+        return TypeDescriptor::create_char_type(orc_type->getMaximumLength());
     case orc::TypeKind::TIMESTAMP_INSTANT:
         return TypeDescriptor(PrimitiveType::TYPE_DATETIMEV2);
     case orc::TypeKind::LIST: {
diff --git a/be/src/vec/exec/format/table/table_format_reader.h b/be/src/vec/exec/format/table/table_format_reader.h
index 0bf7668d25..5ce9856ad8 100644
--- a/be/src/vec/exec/format/table/table_format_reader.h
+++ b/be/src/vec/exec/format/table/table_format_reader.h
@@ -50,6 +50,11 @@ public:
         return _file_format_reader->get_columns(name_to_type, missing_cols);
     }
 
+    Status get_parsed_schema(std::vector<std::string>* col_names,
+                             std::vector<TypeDescriptor>* col_types) override {
+        return _file_format_reader->get_parsed_schema(col_names, col_types);
+    }
+
     virtual Status init_row_filters(const TFileRangeDesc& range) = 0;
 
 protected:
diff --git a/be/src/vec/exec/scan/vfile_scanner.cpp b/be/src/vec/exec/scan/vfile_scanner.cpp
index 6814659717..513c384930 100644
--- a/be/src/vec/exec/scan/vfile_scanner.cpp
+++ b/be/src/vec/exec/scan/vfile_scanner.cpp
@@ -71,6 +71,7 @@
 #include "vec/exprs/vexpr_context.h"
 #include "vec/exprs/vslot_ref.h"
 #include "vec/functions/function.h"
+#include "vec/functions/function_string.h"
 #include "vec/functions/simple_function_factory.h"
 
 namespace cctz {
@@ -273,6 +274,9 @@ Status VFileScanner::_get_block_impl(RuntimeState* state, Block* block, bool* eo
             RETURN_IF_ERROR(_pre_filter_src_block());
             // Convert src block to output block (dest block), string to dest data type and apply filters.
             RETURN_IF_ERROR(_convert_to_output_block(block));
+            // Truncate char columns or varchar columns if size is smaller than file columns
+            // or not found in the file column schema.
+            RETURN_IF_ERROR(_truncate_char_or_varchar_columns(block));
             break;
         }
     } while (true);
@@ -562,6 +566,57 @@ Status VFileScanner::_convert_to_output_block(Block* block) {
     return Status::OK();
 }
 
+Status VFileScanner::_truncate_char_or_varchar_columns(Block* block) {
+    // Truncate char columns or varchar columns if size is smaller than file columns
+    // or not found in the file column schema.
+    if (!_state->query_options().truncate_char_or_varchar_columns) {
+        return Status::OK();
+    }
+    int idx = 0;
+    for (auto slot_desc : _real_tuple_desc->slots()) {
+        if (!slot_desc->is_materialized()) {
+            continue;
+        }
+        const TypeDescriptor& type_desc = slot_desc->type();
+        if (type_desc.type != TYPE_VARCHAR && type_desc.type != TYPE_CHAR) {
+            ++idx;
+            continue;
+        }
+        auto iter = _source_file_col_name_types.find(slot_desc->col_name());
+        if (iter != _source_file_col_name_types.end()) {
+            const TypeDescriptor* file_type_desc =
+                    _source_file_col_name_types[slot_desc->col_name()];
+            if ((type_desc.len > 0) &&
+                (type_desc.len < file_type_desc->len || file_type_desc->len < 0)) {
+                _truncate_char_or_varchar_column(block, idx, type_desc.len);
+            }
+        } else {
+            _truncate_char_or_varchar_column(block, idx, type_desc.len);
+        }
+        ++idx;
+    }
+    return Status::OK();
+}
+
+// VARCHAR substring(VARCHAR str, INT pos[, INT len])
+void VFileScanner::_truncate_char_or_varchar_column(Block* block, int idx, int len) {
+    auto int_type = std::make_shared<DataTypeInt32>();
+    size_t num_columns_without_result = block->columns();
+    block->insert({int_type->create_column_const(block->rows(), to_field(1)), int_type,
+                   "const 1"}); // pos is 1
+    block->insert({int_type->create_column_const(block->rows(), to_field(len)), int_type,
+                   fmt::format("const {}", len)}); // len
+    block->insert({nullptr, std::make_shared<DataTypeString>(), "result"}); // result column
+    ColumnNumbers temp_arguments(3);
+    temp_arguments[0] = idx;                            // str column
+    temp_arguments[1] = num_columns_without_result;     // pos
+    temp_arguments[2] = num_columns_without_result + 1; // len
+    size_t result_column_id = num_columns_without_result + 2;
+    SubstringUtil::substring_execute(*block, temp_arguments, result_column_id, block->rows());
+    block->replace_by_position(idx, block->get_by_position(result_column_id).column);
+    Block::erase_useless_column(block, num_columns_without_result);
+}
+
 Status VFileScanner::_get_next_reader() {
     while (true) {
         if (_cur_reader) {
@@ -594,6 +649,7 @@ Status VFileScanner::_get_next_reader() {
                 format_type = TFileFormatType::FORMAT_PARQUET;
             }
         }
+        bool need_to_get_parsed_schema = false;
         switch (format_type) {
         case TFileFormatType::FORMAT_JNI: {
             if (_real_tuple_desc->table_desc()->table_type() ==
@@ -662,6 +718,7 @@ Status VFileScanner::_get_next_reader() {
                         &_slot_id_to_filter_conjuncts);
                 _cur_reader = std::move(parquet_reader);
             }
+            need_to_get_parsed_schema = true;
             break;
         }
         case TFileFormatType::FORMAT_ORC: {
@@ -694,6 +751,7 @@ Status VFileScanner::_get_next_reader() {
                         &_not_single_slot_filter_conjuncts, &_slot_id_to_filter_conjuncts);
                 _cur_reader = std::move(orc_reader);
             }
+            need_to_get_parsed_schema = true;
             break;
         }
         case TFileFormatType::FORMAT_CSV_PLAIN:
@@ -754,6 +812,21 @@ Status VFileScanner::_get_next_reader() {
             VLOG_NOTICE << fmt::format("Unknown columns:{} in file {}", fmt::to_string(col_buf),
                                        range.path);
         }
+
+        _source_file_col_names.clear();
+        _source_file_col_types.clear();
+        _source_file_col_name_types.clear();
+        if (_state->query_options().truncate_char_or_varchar_columns && need_to_get_parsed_schema) {
+            Status status = _cur_reader->get_parsed_schema(&_source_file_col_names,
+                                                           &_source_file_col_types);
+            if (status != Status::OK() && status.code() != TStatusCode::NOT_IMPLEMENTED_ERROR) {
+                return status;
+            }
+            DCHECK(_source_file_col_names.size() == _source_file_col_types.size());
+            for (int i = 0; i < _source_file_col_names.size(); ++i) {
+                _source_file_col_name_types[_source_file_col_names[i]] = &_source_file_col_types[i];
+            }
+        }
         _cur_reader_eof = false;
         break;
     }
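The new _truncate_char_or_varchar_column helper drives the BE's existing substring
implementation (SubstringUtil::substring_execute) with pos = 1, so its per-cell effect
is the same as the SQL sketch below. substring counts characters rather than bytes, so
multi-byte UTF-8 values are truncated by character count. The table and column names
here are hypothetical:

    -- Truncating a char/varchar column to the table schema's length N (N = 3 here)
    -- behaves like wrapping the column in substring:
    SELECT substring(city, 1, 3) AS city FROM some_external_table;
    -- 'hangzhou' -> 'han'; '哈尔滨' (3 characters) is unchanged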
diff --git a/be/src/vec/exec/scan/vfile_scanner.h b/be/src/vec/exec/scan/vfile_scanner.h
index ff47f21dd5..62f5b2d8ff 100644
--- a/be/src/vec/exec/scan/vfile_scanner.h
+++ b/be/src/vec/exec/scan/vfile_scanner.h
@@ -129,6 +129,11 @@ protected:
     // These columns will be filled by default value or null.
     std::unordered_set<std::string> _missing_cols;
 
+    // The col names and types of source file, such as parquet, orc files.
+    std::vector<std::string> _source_file_col_names;
+    std::vector<TypeDescriptor> _source_file_col_types;
+    std::map<std::string, TypeDescriptor*> _source_file_col_name_types;
+
     // For load task
     vectorized::VExprContextSPtrs _pre_conjunct_ctxs;
     std::unique_ptr<RowDescriptor> _src_row_desc;
@@ -195,6 +200,8 @@ private:
     Status _fill_missing_columns(size_t rows);
     Status _pre_filter_src_block();
     Status _convert_to_output_block(Block* block);
+    Status _truncate_char_or_varchar_columns(Block* block);
+    void _truncate_char_or_varchar_column(Block* block, int idx, int len);
    Status _generate_fill_columns();
    Status _handle_dynamic_block(Block* block);
    Status _process_conjuncts_for_dict_filter();
diff --git a/docs/en/docs/advanced/variables.md b/docs/en/docs/advanced/variables.md
index 2947962502..df9983c52b 100644
--- a/docs/en/docs/advanced/variables.md
+++ b/docs/en/docs/advanced/variables.md
@@ -671,6 +671,12 @@ Translated with www.DeepL.com/Translator (free version)
 
     Used to enable strong consistent reading. By default, Doris supports strong consistency within the same session, that is, changes to data within the same session are visible in real time. If you want strong consistent reads between sessions, set this variable to true.
 
+* `truncate_char_or_varchar_columns`
+
+    Whether to truncate char or varchar columns according to the table's schema. The default is false.
+
+    For external tables, the maximum length of a char or varchar column in the table schema may differ from the schema in the underlying parquet or orc file. If this option is enabled, such columns are truncated to the maximum length defined in the table schema.
+
 ***
 
 #### Supplementary instructions on statement execution timeout control
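For illustration, a session enables the behavior like this (a minimal sketch; the
catalog, database, and table names are hypothetical):

    -- Truncate char/varchar values to the lengths declared in the table schema.
    SET truncate_char_or_varchar_columns = true;
    SELECT city, country FROM hive_catalog.db1.cities;
    -- Restore the default; values are then returned exactly as stored in the files.
    SET truncate_char_or_varchar_columns = false;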
diff --git a/docs/en/docs/lakehouse/multi-catalog/hive.md b/docs/en/docs/lakehouse/multi-catalog/hive.md
index 26c1b98042..87e562ee88 100644
--- a/docs/en/docs/lakehouse/multi-catalog/hive.md
+++ b/docs/en/docs/lakehouse/multi-catalog/hive.md
@@ -233,6 +233,12 @@ For Hive/Iceberge/Hudi
 | `struct<col1: Type1, col2: Type2, ...>` | `struct<col1: Type1, col2: Type2, ...>` | 暂不支持嵌套,Type1, Type2, ... 需要为基础类型 |
 | other | unsupported | |
 
+## Whether to truncate char or varchar columns according to the schema of the hive table
+
+If the variable `truncate_char_or_varchar_columns` is enabled, and the maximum length of a char or varchar column in the schema of the hive table differs from the schema in the underlying parquet or orc file, the column values are truncated to the maximum length declared by the hive table.
+
+The variable defaults to false.
+
 ## Integrate with Apache Ranger
 
 Apache Ranger is a security framework for monitoring, enabling services, and comprehensive data security access management on the Hadoop platform.
diff --git a/docs/zh-CN/docs/advanced/variables.md b/docs/zh-CN/docs/advanced/variables.md
index eca96180ea..4f545c35d5 100644
--- a/docs/zh-CN/docs/advanced/variables.md
+++ b/docs/zh-CN/docs/advanced/variables.md
@@ -658,6 +658,12 @@ try (Connection conn = DriverManager.getConnection("jdbc:mysql://127.0.0.1:9030/
 
     用以开启强一致读。Doris 默认支持同一个会话内的强一致性,即同一个会话内对数据的变更操作是实时可见的。如需要会话间的强一致读,则需将此变量设置为true。
 
+* `truncate_char_or_varchar_columns`
+
+    是否按照表的 schema 来截断 char 或者 varchar 列。默认为 false。
+
+    因为外表会存在表的 schema 中 char 或者 varchar 列的最大长度和底层 parquet 或者 orc 文件中的 schema 不一致的情况。此时开启该选项,会按照表的 schema 中的最大长度进行截断。
+
 ***
 
 #### 关于语句执行超时控制的补充说明
diff --git a/docs/zh-CN/docs/lakehouse/multi-catalog/hive.md b/docs/zh-CN/docs/lakehouse/multi-catalog/hive.md
index 779442b81a..3242426184 100644
--- a/docs/zh-CN/docs/lakehouse/multi-catalog/hive.md
+++ b/docs/zh-CN/docs/lakehouse/multi-catalog/hive.md
@@ -232,6 +232,12 @@ CREATE CATALOG hive PROPERTIES (
 | `struct<col1: Type1, col2: Type2, ...>` | `struct<col1: Type1, col2: Type2, ...>` | 暂不支持嵌套,Type1, Type2, ... 需要为基础类型 |
 | other | unsupported | |
 
+## 是否按照 hive 表的 schema 来截断 char 或者 varchar 列
+
+如果变量 `truncate_char_or_varchar_columns` 开启,则当 hive 表的 schema 中 char 或者 varchar 列的最大长度和底层 parquet 或者 orc 文件中的 schema 不一致时,会按照 hive 表列的最大长度进行截断。
+
+该变量默认为 false。
+
 ## 使用 Ranger 进行权限校验
 
 Apache Ranger是一个用来在Hadoop平台上进行监控,启用服务,以及全方位数据安全访问管理的安全框架。
diff --git a/fe/fe-common/src/main/java/org/apache/doris/catalog/ScalarType.java b/fe/fe-common/src/main/java/org/apache/doris/catalog/ScalarType.java
index 00c788f363..ceffd27fe9 100644
--- a/fe/fe-common/src/main/java/org/apache/doris/catalog/ScalarType.java
+++ b/fe/fe-common/src/main/java/org/apache/doris/catalog/ScalarType.java
@@ -517,7 +517,9 @@ public class ScalarType extends Type {
     }
 
     public static ScalarType createVarcharType() {
-        return DEFAULT_VARCHAR;
+        // Because ScalarType is not an immutable class, it will call setLength() sometimes.
+        // So currently don't use DEFAULT_VARCHAR, will improve it in the future.
+        return new ScalarType(PrimitiveType.VARCHAR);
     }
 
     public static ScalarType createHllType() {
diff --git a/fe/fe-core/src/main/java/org/apache/doris/catalog/HiveMetaStoreClientHelper.java b/fe/fe-core/src/main/java/org/apache/doris/catalog/HiveMetaStoreClientHelper.java
index 586bdd73c4..4b5c2b1d7c 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/catalog/HiveMetaStoreClientHelper.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/catalog/HiveMetaStoreClientHelper.java
@@ -781,20 +781,18 @@ public class HiveMetaStoreClientHelper {
             }
         }
         if (lowerCaseType.startsWith("char")) {
-            ScalarType type = ScalarType.createType(PrimitiveType.CHAR);
             Matcher match = digitPattern.matcher(lowerCaseType);
             if (match.find()) {
-                type.setLength(Integer.parseInt(match.group(1)));
+                return ScalarType.createType(PrimitiveType.CHAR, Integer.parseInt(match.group(1)), 0, 0);
             }
-            return type;
+            return ScalarType.createType(PrimitiveType.CHAR);
         }
         if (lowerCaseType.startsWith("varchar")) {
-            ScalarType type = ScalarType.createType(PrimitiveType.VARCHAR);
             Matcher match = digitPattern.matcher(lowerCaseType);
             if (match.find()) {
-                type.setLength(Integer.parseInt(match.group(1)));
+                return ScalarType.createType(PrimitiveType.VARCHAR, Integer.parseInt(match.group(1)), 0, 0);
             }
-            return type;
+            return ScalarType.createType(PrimitiveType.VARCHAR);
         }
         if (lowerCaseType.startsWith("decimal")) {
             Matcher match = digitPattern.matcher(lowerCaseType);
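With this change, HiveMetaStoreClientHelper builds a correctly sized ScalarType straight
from the Hive type string instead of mutating a shared or freshly created type via
setLength(). As a sketch (the table is hypothetical), a Hive table declared as below
maps to Doris char(3) and varchar(5) columns, which is what the truncation logic later
compares against the file schema:

    -- Hypothetical Hive DDL; the declared lengths (3 and 5) are preserved
    -- when the types are converted for the Doris external catalog.
    CREATE TABLE cities_dim (
        country_code char(3),
        city_abbr    varchar(5)
    );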
diff --git a/fe/fe-core/src/main/java/org/apache/doris/qe/SessionVariable.java b/fe/fe-core/src/main/java/org/apache/doris/qe/SessionVariable.java
index 2d0b7737b3..bc2d8fc97e 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/qe/SessionVariable.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/qe/SessionVariable.java
@@ -365,6 +365,8 @@ public class SessionVariable implements Serializable, Writable {
 
     public static final String PARALLEL_SYNC_ANALYZE_TASK_NUM = "parallel_sync_analyze_task_num";
 
+    public static final String TRUNCATE_CHAR_OR_VARCHAR_COLUMNS = "truncate_char_or_varchar_columns";
+
     public static final String CBO_CPU_WEIGHT = "cbo_cpu_weight";
 
     public static final String CBO_MEM_WEIGHT = "cbo_mem_weight";
@@ -1086,6 +1088,19 @@ public class SessionVariable implements Serializable, Writable {
     @VariableMgr.VarAttr(name = PARALLEL_SYNC_ANALYZE_TASK_NUM)
     public int parallelSyncAnalyzeTaskNum = 2;
 
+    @VariableMgr.VarAttr(name = TRUNCATE_CHAR_OR_VARCHAR_COLUMNS,
+            description = {"是否按照表的 schema 来截断 char 或者 varchar 列。默认为 false。\n"
+                    + "因为外表会存在表的 schema 中 char 或者 varchar 列的最大长度和底层 parquet 或者 orc 文件中的 schema 不一致"
+                    + "的情况。此时开启该选项,会按照表的 schema 中的最大长度进行截断。",
+                    "Whether to truncate char or varchar columns according to the table's schema. "
+                            + "The default is false.\n"
+                            + "For external tables, the maximum length of a char or varchar column in the table"
+                            + " schema may differ from the schema in the underlying parquet or orc file."
+                            + " If this option is enabled, such columns are truncated to the maximum length"
+                            + " defined in the table schema."},
+            needForward = true)
+    public boolean truncateCharOrVarcharColumns = false;
+
     // If this fe is in fuzzy mode, then will use initFuzzyModeVariables to generate some variables,
     // not the default value set in the code.
     public void initFuzzyModeVariables() {
@@ -2060,6 +2075,14 @@ public class SessionVariable implements Serializable, Writable {
         return externalTableAnalyzePartNum;
     }
 
+    public boolean isTruncateCharOrVarcharColumns() {
+        return truncateCharOrVarcharColumns;
+    }
+
+    public void setTruncateCharOrVarcharColumns(boolean truncateCharOrVarcharColumns) {
+        this.truncateCharOrVarcharColumns = truncateCharOrVarcharColumns;
+    }
+
     /**
      * Serialize to thrift object.
      * Used for rest api.
@@ -2144,6 +2167,7 @@ public class SessionVariable implements Serializable, Writable {
         tResult.setEnableOrcLazyMat(enableOrcLazyMat);
 
         tResult.setEnableInsertStrict(enableInsertStrict);
+        tResult.setTruncateCharOrVarcharColumns(truncateCharOrVarcharColumns);
 
         return tResult;
     }
@@ -2457,3 +2481,4 @@ public class SessionVariable implements Serializable, Writable {
         return connectContext.getSessionVariable().enableAggState;
     }
 }
+
diff --git a/gensrc/thrift/PaloInternalService.thrift b/gensrc/thrift/PaloInternalService.thrift
index 99aaf5b011..8928d78438 100644
--- a/gensrc/thrift/PaloInternalService.thrift
+++ b/gensrc/thrift/PaloInternalService.thrift
@@ -227,6 +227,8 @@ struct TQueryOptions {
   75: optional bool enable_insert_strict = false;
 
   76: optional bool enable_inverted_index_query = true;
+
+  77: optional bool truncate_char_or_varchar_columns = false
 }
diff --git a/regression-test/data/external_table_emr_p2/hive/test_truncate_char_or_varchar_columns.out b/regression-test/data/external_table_emr_p2/hive/test_truncate_char_or_varchar_columns.out
new file mode 100644
index 0000000000..5dab20925f
--- /dev/null
+++ b/regression-test/data/external_table_emr_p2/hive/test_truncate_char_or_varchar_columns.out
@@ -0,0 +1,85 @@
+-- This file is automatically generated. You should know what you did if you want to edit this
+-- !q01 --
+1	han	chi
+2	bei	chi
+3	杭州	中华人
+4	Bos	美利坚
+5	哈尔滨	中华人
+
+-- !q02 --
+han	at han in chi
+bei	at bei in chi
+杭州	at 杭州 in 中华人
+Bos	at Bos in 美利坚
+哈尔滨	at 哈尔滨 in 中华人
+
+-- !q01 --
+1	han	chi
+2	bei	chi
+3	杭州	中华人
+4	Bos	美利坚
+5	哈尔滨	中华人
+
+-- !q02 --
+han	at han in chi
+bei	at bei in chi
+杭州	at 杭州 in 中华人
+Bos	at Bos in 美利坚
+哈尔滨	at 哈尔滨 in 中华人
+
+-- !q01 --
+1	han	chi
+2	bei	chi
+3	杭州	中华人
+4	Bos	美利坚
+5	哈尔滨	中华人
+
+-- !q02 --
+han	at han in chi
+bei	at bei in chi
+杭州	at 杭州 in 中华人
+Bos	at Bos in 美利坚
+哈尔滨	at 哈尔滨 in 中华人
+
+-- !q01 --
+1	hangzhou	china
+2	beijing	china
+3	杭州	中华人民共和国
+4	Boston	美利坚合众国
+5	哈尔滨	中华人民共和国
+
+-- !q02 --
+hangzhou	at hangzhou in china
+beijing	at beijing in china
+杭州	at 杭州 in 中华人民共和国
+Boston	at Boston in 美利坚合众国
+哈尔滨	at 哈尔滨 in 中华人民共和国
+
+-- !q01 --
+1	hangzhou	china
+2	beijing	china
+3	杭州	中华人民共和国
+4	Boston	美利坚合众国
+5	哈尔滨	中华人民共和国
+
+-- !q02 --
+hangzhou	at hangzhou in china
+beijing	at beijing in china
+杭州	at 杭州 in 中华人民共和国
+Boston	at Boston in 美利坚合众国
+哈尔滨	at 哈尔滨 in 中华人民共和国
+
+-- !q01 --
+1	hangzhou	china
+2	beijing	china
+3	杭州	中华人民共和国
+4	Boston	美利坚合众国
+5	哈尔滨	中华人民共和国
+
+-- !q02 --
+hangzhou	at hangzhou in china
+beijing	at beijing in china
+杭州	at 杭州 in 中华人民共和国
+Boston	at Boston in 美利坚合众国
+哈尔滨	at 哈尔滨 in 中华人民共和国
+
diff --git a/regression-test/suites/external_table_emr_p2/hive/test_truncate_char_or_varchar_columns.groovy b/regression-test/suites/external_table_emr_p2/hive/test_truncate_char_or_varchar_columns.groovy
new file mode 100644
index 0000000000..4fdf7183c6
--- /dev/null
+++ b/regression-test/suites/external_table_emr_p2/hive/test_truncate_char_or_varchar_columns.groovy
@@ -0,0 +1,78 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+suite("test_truncate_char_or_varchar_columns", "p2") {
+    String enabled = context.config.otherConfigs.get("enableExternalHiveTest")
+    if (enabled != null && enabled.equalsIgnoreCase("true")) {
+        String extHiveHmsHost = context.config.otherConfigs.get("extHiveHmsHost")
+        String extHiveHmsPort = context.config.otherConfigs.get("extHiveHmsPort")
+        String catalog_name = "test_truncate_char_or_varchar_columns"
+
+        sql """drop catalog if exists ${catalog_name};"""
+        sql """
+            create catalog if not exists ${catalog_name} properties (
+                'type'='hms',
+                'hive.metastore.uris' = 'thrift://${extHiveHmsHost}:${extHiveHmsPort}'
+            );
+        """
+
+        sql """switch ${catalog_name};"""
+        sql """ set truncate_char_or_varchar_columns=true; """
+        // test parquet format
+        def q01_parquet = {
+            qt_q01 """ select * from multi_catalog.test_truncate_char_or_varchar_columns_parquet order by id """
+            qt_q02 """ select city, concat("at ", city, " in ", country) from regression.multi_catalog.test_truncate_char_or_varchar_columns_parquet order by id """
+        }
+        // test orc format
+        def q01_orc = {
+            qt_q01 """ select * from multi_catalog.test_truncate_char_or_varchar_columns_orc order by id """
+            qt_q02 """ select city, concat("at ", city, " in ", country) from regression.multi_catalog.test_truncate_char_or_varchar_columns_orc order by id """
+        }
+        // test text format
+        def q01_text = {
+            qt_q01 """ select * from multi_catalog.test_truncate_char_or_varchar_columns_text order by id """
+            qt_q02 """ select city, concat("at ", city, " in ", country) from regression.multi_catalog.test_truncate_char_or_varchar_columns_text order by id """
+        }
+        sql """ use `multi_catalog`; """
+        q01_parquet()
+        q01_orc()
+        q01_text()
+
+        sql """switch ${catalog_name};"""
+        sql """ set truncate_char_or_varchar_columns=false; """
+        // test parquet format
+        def q02_parquet = {
+            qt_q01 """ select * from multi_catalog.test_truncate_char_or_varchar_columns_parquet order by id """
+            qt_q02 """ select city, concat("at ", city, " in ", country) from regression.multi_catalog.test_truncate_char_or_varchar_columns_parquet order by id """
+        }
+        // test orc format
+        def q02_orc = {
+            qt_q01 """ select * from multi_catalog.test_truncate_char_or_varchar_columns_orc order by id """
+            qt_q02 """ select city, concat("at ", city, " in ", country) from regression.multi_catalog.test_truncate_char_or_varchar_columns_orc order by id """
+        }
+        // test text format
+        def q02_text = {
+            qt_q01 """ select * from multi_catalog.test_truncate_char_or_varchar_columns_text order by id """
+            qt_q02 """ select city, concat("at ", city, " in ", country) from regression.multi_catalog.test_truncate_char_or_varchar_columns_text order by id """
+        }
+        sql """ use `multi_catalog`; """
+        q02_parquet()
+        q02_orc()
+        q02_text()
+    }
+}
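Reading the expected output together with the test: text files carry no column schema,
so for the text format the char/varchar slots are never found in
_source_file_col_name_types and are truncated unconditionally when the variable is on,
which is why all three formats produce the same truncated rows. A condensed sketch of
the round trip (table names follow the test; the declared column length of 3 characters
is inferred from the expected output):

    -- Variable on: cells longer than the declared length are cut down;
    -- truncation is by character, so '哈尔滨' (3 chars) survives intact.
    SET truncate_char_or_varchar_columns = true;
    SELECT * FROM multi_catalog.test_truncate_char_or_varchar_columns_parquet ORDER BY id;
    -- 1  han  chi            (file values: hangzhou / china)

    -- Variable off (the default): file values pass through unchanged.
    SET truncate_char_or_varchar_columns = false;
    SELECT * FROM multi_catalog.test_truncate_char_or_varchar_columns_parquet ORDER BY id;
    -- 1  hangzhou  china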