This is an automated email from the ASF dual-hosted git repository.

morningman pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/doris.git


The following commit(s) were added to refs/heads/master by this push:
     new 124516c1ea [Fix](orc-reader) Fix `Wrong data type for column` error 
when column order in hive table is not same in orc file schema. (#21306)
124516c1ea is described below

commit 124516c1ea429c6743093e41d607fb3de0a29e3c
Author: Qi Chen <kaka11.c...@gmail.com>
AuthorDate: Mon Jul 3 09:32:55 2023 +0800

    [Fix](orc-reader) Fix `Wrong data type for column` error when column order 
in hive table is not same in orc file schema. (#21306)
    
    `Wrong data type for column` error when column order in hive table is not 
same in orc file schema.
    
    The root cause is in order to handle the following case:
    
    The table in orc format of Hive 1.x may encounter system column names such 
as `_col0`, `_col1`, `_col2`... in the underlying orc file schema, which need 
to use the column names in the hive table for mapping.
    
    ### Solution
    Currently fix this issue by handling the following case by specifying hive 
version to 1.x.x in the hive catalog configuration.
    
    ```sql
    CREATE CATALOG hive PROPERTIES (
        'hive.version' = '1.x.x'
    );
    ```
---
 .../hive/scripts/create_preinstalled_table.hql     |  20 +++++++
 .../orc/test_different_column_orders.orc           | Bin 0 -> 671 bytes
 .../parquet/test_different_column_orders.parquet   | Bin 0 -> 925 bytes
 docs/en/docs/lakehouse/faq.md                      |  11 +++-
 docs/zh-CN/docs/lakehouse/faq.md                   |   8 +++
 .../doris/catalog/external/HMSExternalTable.java   |   4 ++
 .../doris/datasource/HMSExternalCatalog.java       |   4 ++
 .../doris/datasource/hive/HiveVersionUtil.java     |  22 ++++++++
 .../doris/planner/external/HiveScanNode.java       |   5 +-
 .../hive/test_different_column_orders.out          |  29 ++++++++++
 .../hive/test_different_column_orders.groovy       |  60 +++++++++++++++++++++
 11 files changed, 161 insertions(+), 2 deletions(-)

diff --git 
a/docker/thirdparties/docker-compose/hive/scripts/create_preinstalled_table.hql 
b/docker/thirdparties/docker-compose/hive/scripts/create_preinstalled_table.hql
index 69c58d0ac8..9f34c121df 100644
--- 
a/docker/thirdparties/docker-compose/hive/scripts/create_preinstalled_table.hql
+++ 
b/docker/thirdparties/docker-compose/hive/scripts/create_preinstalled_table.hql
@@ -660,4 +660,24 @@ update orc_full_acid_par set value = 'BB' where id = 2;
 alter table orc_full_acid_par PARTITION(part_col=20230101) compact 'major';
 alter table orc_full_acid_par PARTITION(part_col=20230102) compact 'major';
 
+CREATE TABLE `test_different_column_orders_orc`(
+  `name` string,
+  `id` int,
+  `city` string,
+  `age` int,
+  `sex` string)
+STORED AS ORC
+LOCATION
+  '/user/doris/preinstalled_data/test_different_column_orders/orc';
+
+CREATE TABLE `test_different_column_orders_parquet`(
+  `name` string,
+  `id` int,
+  `city` string,
+  `age` int,
+  `sex` string)
+STORED AS PARQUET
+LOCATION
+  '/user/doris/preinstalled_data/test_different_column_orders/parquet';
+
 show tables;
diff --git 
a/docker/thirdparties/docker-compose/hive/scripts/preinstalled_data/test_different_column_orders/orc/test_different_column_orders.orc
 
b/docker/thirdparties/docker-compose/hive/scripts/preinstalled_data/test_different_column_orders/orc/test_different_column_orders.orc
new file mode 100644
index 0000000000..3b8313cc4a
Binary files /dev/null and 
b/docker/thirdparties/docker-compose/hive/scripts/preinstalled_data/test_different_column_orders/orc/test_different_column_orders.orc
 differ
diff --git 
a/docker/thirdparties/docker-compose/hive/scripts/preinstalled_data/test_different_column_orders/parquet/test_different_column_orders.parquet
 
b/docker/thirdparties/docker-compose/hive/scripts/preinstalled_data/test_different_column_orders/parquet/test_different_column_orders.parquet
new file mode 100644
index 0000000000..7bdfeea380
Binary files /dev/null and 
b/docker/thirdparties/docker-compose/hive/scripts/preinstalled_data/test_different_column_orders/parquet/test_different_column_orders.parquet
 differ
diff --git a/docs/en/docs/lakehouse/faq.md b/docs/en/docs/lakehouse/faq.md
index ee40736f4d..ac9688bf08 100644
--- a/docs/en/docs/lakehouse/faq.md
+++ b/docs/en/docs/lakehouse/faq.md
@@ -138,4 +138,13 @@ under the License.
             }
         ]
     }
-    ```
\ No newline at end of file
+    ```
+
+13. The table in orc format of Hive 1.x may encounter system column names such 
as `_col0`, `_col1`, `_col2`... in the underlying orc file schema, which need 
to be specified in the catalog configuration. Add `hive.version` to 1.x.x so 
that it will use the column names in the hive table for mapping.
+
+    ```sql
+    CREATE CATALOG hive PROPERTIES (
+        'hive.version' = '1.x.x'
+    );
+    ```
+
diff --git a/docs/zh-CN/docs/lakehouse/faq.md b/docs/zh-CN/docs/lakehouse/faq.md
index 4ba1910f4f..24a6ae94f2 100644
--- a/docs/zh-CN/docs/lakehouse/faq.md
+++ b/docs/zh-CN/docs/lakehouse/faq.md
@@ -140,3 +140,11 @@ under the License.
     }
     ```
 
+13. Hive 1.x 的 orc 格式的表可能会遇到底层 orc 文件 schema 中列名为 `_col0`,`_col1`,`_col2`... 
这类系统列名,此时需要在 catalog 配置中添加 `hive.version` 为 1.x.x,这样就会使用 hive 表中的列名进行映射。
+
+    ```sql
+    CREATE CATALOG hive PROPERTIES (
+        'hive.version' = '1.x.x'
+    );
+    ```
+
diff --git 
a/fe/fe-core/src/main/java/org/apache/doris/catalog/external/HMSExternalTable.java
 
b/fe/fe-core/src/main/java/org/apache/doris/catalog/external/HMSExternalTable.java
index 3e10f6636d..c9f03fd80c 100644
--- 
a/fe/fe-core/src/main/java/org/apache/doris/catalog/external/HMSExternalTable.java
+++ 
b/fe/fe-core/src/main/java/org/apache/doris/catalog/external/HMSExternalTable.java
@@ -341,6 +341,10 @@ public class HMSExternalTable extends ExternalTable {
         return ((HMSExternalCatalog) catalog).getHiveMetastoreUris();
     }
 
+    public String getHiveVersion() {
+        return ((HMSExternalCatalog) catalog).getHiveVersion();
+    }
+
     public Map<String, String> getCatalogProperties() {
         return catalog.getProperties();
     }
diff --git 
a/fe/fe-core/src/main/java/org/apache/doris/datasource/HMSExternalCatalog.java 
b/fe/fe-core/src/main/java/org/apache/doris/datasource/HMSExternalCatalog.java
index 4d6c8b86fe..9d4fc479a9 100644
--- 
a/fe/fe-core/src/main/java/org/apache/doris/datasource/HMSExternalCatalog.java
+++ 
b/fe/fe-core/src/main/java/org/apache/doris/datasource/HMSExternalCatalog.java
@@ -124,6 +124,10 @@ public class HMSExternalCatalog extends ExternalCatalog {
         return catalogProperty.getOrDefault(HMSProperties.HIVE_METASTORE_URIS, 
"");
     }
 
+    public String getHiveVersion() {
+        return catalogProperty.getOrDefault(HMSProperties.HIVE_VERSION, "");
+    }
+
     protected List<String> listDatabaseNames() {
         return client.getAllDatabases();
     }
diff --git 
a/fe/fe-core/src/main/java/org/apache/doris/datasource/hive/HiveVersionUtil.java
 
b/fe/fe-core/src/main/java/org/apache/doris/datasource/hive/HiveVersionUtil.java
index 749520b62c..b93d5653ad 100644
--- 
a/fe/fe-core/src/main/java/org/apache/doris/datasource/hive/HiveVersionUtil.java
+++ 
b/fe/fe-core/src/main/java/org/apache/doris/datasource/hive/HiveVersionUtil.java
@@ -72,4 +72,26 @@ public class HiveVersionUtil {
             return DEFAULT_HIVE_VERSION;
         }
     }
+
+    public static boolean isHive1(String version) {
+        if (Strings.isNullOrEmpty(version)) {
+            return false;
+        }
+        String[] parts = version.split("\\.");
+        if (parts.length < 2) {
+            LOG.warn("invalid hive version: " + version);
+            return false;
+        }
+        try {
+            int major = Integer.parseInt(parts[0]);
+            if (major == 1) {
+                return true;
+            } else {
+                return false;
+            }
+        } catch (NumberFormatException e) {
+            LOG.warn("invalid hive version: " + version);
+            return false;
+        }
+    }
 }
diff --git 
a/fe/fe-core/src/main/java/org/apache/doris/planner/external/HiveScanNode.java 
b/fe/fe-core/src/main/java/org/apache/doris/planner/external/HiveScanNode.java
index 738a2e3933..710fed10dd 100644
--- 
a/fe/fe-core/src/main/java/org/apache/doris/planner/external/HiveScanNode.java
+++ 
b/fe/fe-core/src/main/java/org/apache/doris/planner/external/HiveScanNode.java
@@ -37,6 +37,7 @@ import org.apache.doris.datasource.hive.HiveMetaStoreCache;
 import org.apache.doris.datasource.hive.HiveMetaStoreCache.FileCacheValue;
 import org.apache.doris.datasource.hive.HivePartition;
 import org.apache.doris.datasource.hive.HiveTransaction;
+import org.apache.doris.datasource.hive.HiveVersionUtil;
 import org.apache.doris.planner.ListPartitionPrunerV2;
 import org.apache.doris.planner.PlanNodeId;
 import org.apache.doris.planner.external.HiveSplit.HiveSplitCreator;
@@ -91,7 +92,9 @@ public class HiveScanNode extends FileQueryScanNode {
     @Override
     protected void doInitialize() throws UserException {
         super.doInitialize();
-        genSlotToSchemaIdMap();
+        if (HiveVersionUtil.isHive1(hmsTable.getHiveVersion())) {
+            genSlotToSchemaIdMap();
+        }
         String inputFormat = 
hmsTable.getRemoteTable().getSd().getInputFormat();
         if (inputFormat.contains("TextInputFormat")) {
             for (SlotDescriptor slot : desc.getSlots()) {
diff --git 
a/regression-test/data/external_catalog_p0/hive/test_different_column_orders.out
 
b/regression-test/data/external_catalog_p0/hive/test_different_column_orders.out
new file mode 100644
index 0000000000..6e3fc17262
--- /dev/null
+++ 
b/regression-test/data/external_catalog_p0/hive/test_different_column_orders.out
@@ -0,0 +1,29 @@
+-- This file is automatically generated. You should know what you did if you 
want to edit this
+-- !q01 --
+Tom    1       shanghai        48      male
+Jerry  2       guangzhou       35      male
+Frank  3       hangzhou        25      male
+Ada    4       beijing 22      female
+
+-- !q02 --
+4
+
+-- !q03 --
+guangzhou      1
+hangzhou       1
+shanghai       1
+
+-- !q01 --
+Tom    1       shanghai        48      male
+Jerry  2       guangzhou       35      male
+Frank  3       hangzhou        25      male
+Ada    4       beijing 22      female
+
+-- !q02 --
+4
+
+-- !q03 --
+guangzhou      1
+hangzhou       1
+shanghai       1
+
diff --git 
a/regression-test/suites/external_catalog_p0/hive/test_different_column_orders.groovy
 
b/regression-test/suites/external_catalog_p0/hive/test_different_column_orders.groovy
new file mode 100644
index 0000000000..3fcb5a3c3f
--- /dev/null
+++ 
b/regression-test/suites/external_catalog_p0/hive/test_different_column_orders.groovy
@@ -0,0 +1,60 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+suite("test_different_column_orders", "p0") {
+    def q_parquet = {
+        qt_q01 """
+        select * from test_different_column_orders_parquet order by id;
+        """
+        qt_q02 """
+        select count(id) from test_different_column_orders_parquet;
+        """
+        qt_q03 """
+        select city, count(*) from test_different_column_orders_parquet where 
sex = 'male' group by city order by city;
+        """
+    }
+    def q_orc = {
+        qt_q01 """
+        select * from test_different_column_orders_orc order by id;
+        """
+        qt_q02 """
+        select count(id) from test_different_column_orders_orc;
+        """
+        qt_q03 """
+        select city, count(*) from test_different_column_orders_orc where sex 
= 'male' group by city order by city;
+        """
+    }
+    String enabled = context.config.otherConfigs.get("enableHiveTest")
+    if (enabled != null && enabled.equalsIgnoreCase("true")) {
+        try {
+            String hms_port = context.config.otherConfigs.get("hms_port")
+            String catalog_name = "test_different_column_orders"
+            sql """drop catalog if exists ${catalog_name}"""
+            sql """create catalog if not exists ${catalog_name} properties (
+                "type"="hms",
+                'hive.metastore.uris' = 'thrift://127.0.0.1:${hms_port}'
+            );"""
+            sql """use `${catalog_name}`.`default`"""
+
+            q_parquet()
+            q_orc()
+
+            sql """drop catalog if exists ${catalog_name}"""
+        } finally {
+        }
+    }
+}


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org
For additional commands, e-mail: commits-h...@doris.apache.org

Reply via email to