This is an automated email from the ASF dual-hosted git repository.

morningman pushed a commit to branch branch-2.1-lakehouse
in repository https://gitbox.apache.org/repos/asf/doris.git

commit 5179917c7852b8bdda16c65788e928cc32ccadb0
Author: daidai <changyu...@selectdb.com>
AuthorDate: Fri Jan 10 18:18:27 2025 +0800

    [fix](hive) fix the select count(*) optimization error on Hive full ACID tables. (#46732)
    
    ### What problem does this PR solve?
    Problem Summary:
    Previous PR: #44038
    That PR optimized how splits are generated in the count(*) scenario.
    However, it caused problems with Hive ACID tables. This PR fixes them
    and adds tests.
    In the count(*) scenario, reading a Hive full ACID table cannot use the
    optimization: the files still need to be split because merge-on-read is
    required. A Hive insert-only ACID table, by contrast, does not need to
    be split.
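    
    For illustration, here is a minimal sketch of the split decision described
    above, under the assumption that it can be reduced to three booleans. The
    class and method names in the sketch are invented; the three inputs only
    stand in for the real calls visible in the HiveScanNode hunk below
    (getPushDownAggNoGroupingOp() == TPushAggOp.COUNT,
    hmsTable.isHiveTransactionalTable(), hmsTable.isFullAcidTable()).
    
    ```java
    // Illustrative sketch only; not part of the patch below.
    public class CountStarSplitSketch {
    
        // Full ACID tables need merge-on-read (delete deltas must be applied),
        // so the count(*) fast path must not skip splitting for them.
        // Insert-only ACID tables and ordinary parquet/orc tables stay eligible.
        static boolean maySkipSplit(boolean pushDownAggIsCount,
                                    boolean isTransactional,
                                    boolean isFullAcid) {
            return pushDownAggIsCount && !(isTransactional && isFullAcid);
        }
    
        public static void main(String[] args) {
            System.out.println(maySkipSplit(true, true, true));   // full ACID table -> false (must split)
            System.out.println(maySkipSplit(true, true, false));  // insert-only ACID table -> true
            System.out.println(maySkipSplit(true, false, false)); // plain parquet/orc table -> true
        }
    }
    ```
    
    In the actual change this condition only guards the fast path: as the hunk
    below shows, the guarded block goes on to count the files of the scan (the
    rest of that logic is outside the hunk).
    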
---
 .../vec/exec/format/table/transactional_hive_reader.cpp |   1 +
 .../doris/datasource/hive/source/HiveScanNode.java      |   4 ++--
 .../external_table_p0/hive/test_transactional_hive.out  | Bin 835 -> 925 bytes
 .../hive/test_hive_translation_insert_only.out          | Bin 181 -> 235 bytes
 .../hive/test_transactional_hive.groovy                 |  12 ++++++++++++
 .../hive/test_hive_translation_insert_only.groovy       |   5 +++++
 6 files changed, 20 insertions(+), 2 deletions(-)

diff --git a/be/src/vec/exec/format/table/transactional_hive_reader.cpp b/be/src/vec/exec/format/table/transactional_hive_reader.cpp
index 18642ab1218..caf24270018 100644
--- a/be/src/vec/exec/format/table/transactional_hive_reader.cpp
+++ b/be/src/vec/exec/format/table/transactional_hive_reader.cpp
@@ -205,6 +205,7 @@ Status TransactionalHiveReader::init_row_filters(const TFileRangeDesc& range,
         ++num_delete_files;
     }
     if (num_delete_rows > 0) {
+        orc_reader->set_push_down_agg_type(TPushAggOp::NONE);
         orc_reader->set_delete_rows(&_delete_rows);
         COUNTER_UPDATE(_transactional_orc_profile.num_delete_files, num_delete_files);
         COUNTER_UPDATE(_transactional_orc_profile.num_delete_rows, num_delete_rows);
diff --git a/fe/fe-core/src/main/java/org/apache/doris/datasource/hive/source/HiveScanNode.java b/fe/fe-core/src/main/java/org/apache/doris/datasource/hive/source/HiveScanNode.java
index d66a4c08e9d..890f6147f33 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/datasource/hive/source/HiveScanNode.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/datasource/hive/source/HiveScanNode.java
@@ -296,12 +296,12 @@ public class HiveScanNode extends FileQueryScanNode {
          * we don't need to split the file because for parquet/orc format, only metadata is read.
          * If we split the file, we will read metadata of a file multiple times, which is not efficient.
          *
-         * - Hive Transactional Table may need merge on read, so do not apply this optimization.
+         * - Hive Full Acid Transactional Table may need merge on read, so do not apply this optimization.
          * - If the file format is not parquet/orc, eg, text, we need to split the file to increase the parallelism.
          */
         boolean needSplit = true;
         if (getPushDownAggNoGroupingOp() == TPushAggOp.COUNT
-                && hiveTransaction != null) {
+                && !(hmsTable.isHiveTransactionalTable() && hmsTable.isFullAcidTable())) {
             int totalFileNum = 0;
             for (FileCacheValue fileCacheValue : fileCaches) {
                 if (fileCacheValue.getFiles() != null) {
diff --git a/regression-test/data/external_table_p0/hive/test_transactional_hive.out b/regression-test/data/external_table_p0/hive/test_transactional_hive.out
index 060fa8c048e..94e32a43db7 100644
Binary files a/regression-test/data/external_table_p0/hive/test_transactional_hive.out and b/regression-test/data/external_table_p0/hive/test_transactional_hive.out differ
diff --git a/regression-test/data/external_table_p2/hive/test_hive_translation_insert_only.out b/regression-test/data/external_table_p2/hive/test_hive_translation_insert_only.out
index e4bdb3fe32d..f43a630f4a3 100644
Binary files a/regression-test/data/external_table_p2/hive/test_hive_translation_insert_only.out and b/regression-test/data/external_table_p2/hive/test_hive_translation_insert_only.out differ
diff --git a/regression-test/suites/external_table_p0/hive/test_transactional_hive.groovy b/regression-test/suites/external_table_p0/hive/test_transactional_hive.groovy
index 4f7008ec172..a12ab8a4f78 100644
--- a/regression-test/suites/external_table_p0/hive/test_transactional_hive.groovy
+++ b/regression-test/suites/external_table_p0/hive/test_transactional_hive.groovy
@@ -114,6 +114,14 @@ suite("test_transactional_hive", "p0,external,hive,external_docker,external_dock
 
         }
     }
+    
+    def test_acid_count = {
+        qt_count_1 """ select count(*) from orc_full_acid; """ // 3 
+        qt_count_2 """ select count(*) from orc_full_acid_par; """  // 6
+        qt_count_3 """ select count(*) from orc_to_acid_compacted_tb; """ //4
+        qt_count_4 """ select count(*) from orc_acid_minor; """ //3
+        qt_count_5 """ select count(*) from orc_acid_major; """ //3
+    }
 
 
     String enabled = context.config.otherConfigs.get("enableHiveTest")
@@ -148,6 +156,10 @@ suite("test_transactional_hive", "p0,external,hive,external_docker,external_dock
             test_acid()
             test_acid_write()
 
+
+            test_acid_count()
+            
+            
             sql """drop catalog if exists ${catalog_name}"""
         } finally {
         }
diff --git a/regression-test/suites/external_table_p2/hive/test_hive_translation_insert_only.groovy b/regression-test/suites/external_table_p2/hive/test_hive_translation_insert_only.groovy
index 758417c3237..f7135175152 100644
--- a/regression-test/suites/external_table_p2/hive/test_hive_translation_insert_only.groovy
+++ b/regression-test/suites/external_table_p2/hive/test_hive_translation_insert_only.groovy
@@ -45,6 +45,11 @@ suite("test_hive_translation_insert_only", "p2,external,hive,external_remote,ext
     qt_2 """ select * from parquet_insert_only_major order by id """ 
     qt_3 """ select * from orc_insert_only_minor order by id """ 
 
+    qt_count_1 """ select count(*) from text_insert_only """ //4 
+    qt_count_2 """ select count(*) from parquet_insert_only_major """ //5 
+    qt_count_3 """ select count(*) from orc_insert_only_minor """ //5
+
+
     sql """drop catalog ${hms_catalog_name};"""
 }
 


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org
For additional commands, e-mail: commits-h...@doris.apache.org
