This is an automated email from the ASF dual-hosted git repository.

lijibing pushed a commit to branch branch-2.1
in repository https://gitbox.apache.org/repos/asf/doris.git


The following commit(s) were added to refs/heads/branch-2.1 by this push:
     new 259d28407e4 [improvement](statistics)Enable estimate hive table row 
count using file size. (#37218) (#37694)
259d28407e4 is described below

commit 259d28407e464acf7bfc868842396857d5ed88fd
Author: Jibing-Li <64681310+jibing...@users.noreply.github.com>
AuthorDate: Fri Jul 12 13:47:27 2024 +0800

    [improvement](statistics)Enable estimate hive table row count using file 
size. (#37218) (#37694)
    
    backport: https://github.com/apache/doris/pull/37218
---
 .../main/java/org/apache/doris/common/Config.java  |   2 +-
 .../doris/datasource/hive/HMSExternalTable.java    |   2 +-
 .../java/org/apache/doris/qe/GlobalVariable.java   |   2 +-
 .../hive/test_hive_partition_column_analyze.groovy | 325 +++++++++++----------
 4 files changed, 172 insertions(+), 159 deletions(-)

diff --git a/fe/fe-common/src/main/java/org/apache/doris/common/Config.java 
b/fe/fe-common/src/main/java/org/apache/doris/common/Config.java
index 1be7b871d68..995813e06e6 100644
--- a/fe/fe-common/src/main/java/org/apache/doris/common/Config.java
+++ b/fe/fe-common/src/main/java/org/apache/doris/common/Config.java
@@ -2282,7 +2282,7 @@ public class Config extends ConfigBase {
     @ConfField(mutable = true, masterOnly = false, description = {
         "Hive行数估算分区采样数",
         "Sample size for hive row count estimation."})
-    public static int hive_stats_partition_sample_size = 3000;
+    public static int hive_stats_partition_sample_size = 30;
 
     @ConfField(mutable = true, masterOnly = true, description = {
             "启用Hive分桶表",
diff --git 
a/fe/fe-core/src/main/java/org/apache/doris/datasource/hive/HMSExternalTable.java
 
b/fe/fe-core/src/main/java/org/apache/doris/datasource/hive/HMSExternalTable.java
index 03eac33ab53..76f2f6c4b39 100644
--- 
a/fe/fe-core/src/main/java/org/apache/doris/datasource/hive/HMSExternalTable.java
+++ 
b/fe/fe-core/src/main/java/org/apache/doris/datasource/hive/HMSExternalTable.java
@@ -875,7 +875,7 @@ public class HMSExternalTable extends ExternalTable 
implements MTMVRelatedTableI
         }
 
         int totalPartitionSize = partitionValues == null ? 1 : 
partitionValues.getIdToPartitionItem().size();
-        if (samplePartitionSize < totalPartitionSize) {
+        if (samplePartitionSize != 0 && samplePartitionSize < 
totalPartitionSize) {
             totalSize = totalSize * totalPartitionSize / samplePartitionSize;
         }
         return totalSize / estimatedRowSize;
diff --git a/fe/fe-core/src/main/java/org/apache/doris/qe/GlobalVariable.java 
b/fe/fe-core/src/main/java/org/apache/doris/qe/GlobalVariable.java
index 6eac0c2b815..abe5d452a8f 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/qe/GlobalVariable.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/qe/GlobalVariable.java
@@ -146,7 +146,7 @@ public final class GlobalVariable {
                             + "Getting file list may be a time-consuming 
operation. "
                             + "If you don't need to estimate the number of 
rows in the table "
                             + "or it affects performance, you can disable this 
feature."})
-    public static boolean enable_get_row_count_from_file_list = false;
+    public static boolean enable_get_row_count_from_file_list = true;
 
     @VariableMgr.VarAttr(name = READ_ONLY, flag = VariableMgr.GLOBAL,
             description = {"仅用于兼容MySQL生态,暂无实际意义",
diff --git 
a/regression-test/suites/external_table_p0/hive/test_hive_partition_column_analyze.groovy
 
b/regression-test/suites/external_table_p0/hive/test_hive_partition_column_analyze.groovy
index 7704b68e399..9accfcaa445 100644
--- 
a/regression-test/suites/external_table_p0/hive/test_hive_partition_column_analyze.groovy
+++ 
b/regression-test/suites/external_table_p0/hive/test_hive_partition_column_analyze.groovy
@@ -18,7 +18,13 @@
 suite("test_hive_partition_column_analyze", 
"p0,external,hive,external_docker,external_docker_hive") {
     String enabled = context.config.otherConfigs.get("enableHiveTest")
     if (enabled == null || !enabled.equalsIgnoreCase("true")) {
-        logger.info("diable Hive test.")
+        logger.info("disable Hive test.")
+        return;
+    }
+
+    def enable = sql """show variables like "enable_partition_analyze" """
+    if (enable.size() != 1 || enable[0][1].equalsIgnoreCase("false")) {
+        logger.info("partition analyze disabled. " + enable)
         return;
     }
 
@@ -36,161 +42,168 @@ suite("test_hive_partition_column_analyze", 
"p0,external,hive,external_docker,ex
         """
         logger.info("catalog " + catalog_name + " created")
 
-        try {
-            sql """set global enable_get_row_count_from_file_list=true"""
-
-            sql """switch ${catalog_name};"""
-            logger.info("switched to catalog " + catalog_name)
-
-            sql """analyze table 
${catalog_name}.partition_type.tinyint_partition (tinyint_part) with sync"""
-            sql """analyze table 
${catalog_name}.partition_type.smallint_partition (smallint_part) with sync"""
-            sql """analyze table ${catalog_name}.partition_type.int_partition 
(int_part) with sync"""
-            sql """analyze table 
${catalog_name}.partition_type.bigint_partition (bigint_part) with sync"""
-            sql """analyze table ${catalog_name}.partition_type.char_partition 
(char_part) with sync"""
-            sql """analyze table 
${catalog_name}.partition_type.varchar_partition (varchar_part) with sync"""
-            sql """analyze table 
${catalog_name}.partition_type.string_partition (string_part) with sync"""
-            sql """analyze table ${catalog_name}.partition_type.date_partition 
(date_part) with sync"""
-            sql """analyze table 
${catalog_name}.partition_type.float_partition (float_part) with sync"""
-            sql """analyze table 
${catalog_name}.partition_type.double_partition (double_part) with sync"""
-            sql """analyze table 
${catalog_name}.partition_type.decimal_partition (decimal_part) with sync"""
-
-            sql """use partition_type;"""
-
-            result = sql """show column stats tinyint_partition 
(tinyint_part)"""
-            assertEquals(result.size(), 1)
-            assertEquals(result[0][0], "tinyint_part")
-            assertEquals(result[0][2], "141474.0")
-            assertEquals(result[0][3], "100.0")
-            assertEquals(result[0][4], "0.0")
-            assertEquals(result[0][5], "141474.0")
-            assertEquals(result[0][6], "1.0")
-            assertEquals(result[0][7], "1")
-            assertEquals(result[0][8], "100")
-
-            result = sql """show column stats smallint_partition 
(smallint_part)"""
-            assertEquals(result.size(), 1)
-            assertEquals(result[0][0], "smallint_part")
-            assertEquals(result[0][2], "141474.0")
-            assertEquals(result[0][3], "100.0")
-            assertEquals(result[0][4], "0.0")
-            assertEquals(result[0][5], "282948.0")
-            assertEquals(result[0][6], "2.0")
-            assertEquals(result[0][7], "1")
-            assertEquals(result[0][8], "100")
-
-            result = sql """show column stats int_partition (int_part)"""
-            assertEquals(result.size(), 1)
-            assertEquals(result[0][0], "int_part")
-            assertEquals(result[0][2], "141474.0")
-            assertEquals(result[0][3], "100.0")
-            assertEquals(result[0][4], "0.0")
-            assertEquals(result[0][5], "565896.0")
-            assertEquals(result[0][6], "4.0")
-            assertEquals(result[0][7], "1")
-            assertEquals(result[0][8], "100")
-
-            result = sql """show column stats bigint_partition (bigint_part)"""
-            assertEquals(result.size(), 1)
-            assertEquals(result[0][0], "bigint_part")
-            assertEquals(result[0][2], "141474.0")
-            assertEquals(result[0][3], "100.0")
-            assertEquals(result[0][4], "0.0")
-            assertEquals(result[0][5], "1131792.0")
-            assertEquals(result[0][6], "8.0")
-            assertEquals(result[0][7], "1")
-            assertEquals(result[0][8], "100")
-
-            result = sql """show column stats char_partition (char_part)"""
-            assertEquals(result.size(), 1)
-            assertEquals(result[0][0], "char_part")
-            assertEquals(result[0][2], "141474.0")
-            assertEquals(result[0][3], "101.0")
-            assertEquals(result[0][4], "0.0")
-            assertEquals(result[0][5], "2829480.0")
-            assertEquals(result[0][6], "20.0")
-            assertEquals(result[0][7], "\'1                   \'")
-            assertEquals(result[0][8], "\'a                   \'")
-
-            result = sql """show column stats varchar_partition 
(varchar_part)"""
-            assertEquals(result.size(), 1)
-            assertEquals(result[0][0], "varchar_part")
-            assertEquals(result[0][2], "141474.0")
-            assertEquals(result[0][3], "100.0")
-            assertEquals(result[0][4], "0.0")
-            assertEquals(result[0][5], "271630.0")
-            assertEquals(result[0][6], "1.9199994345250717")
-            assertEquals(result[0][7], "\'1\'")
-            assertEquals(result[0][8], "\'99\'")
-
-            result = sql """show column stats string_partition (string_part)"""
-            assertEquals(result.size(), 1)
-            assertEquals(result[0][0], "string_part")
-            assertEquals(result[0][2], "141474.0")
-            assertEquals(result[0][3], "100.0")
-            assertEquals(result[0][4], "0.0")
-            assertEquals(result[0][5], "271630.0")
-            assertEquals(result[0][6], "1.9199994345250717")
-            assertEquals(result[0][7], "\'1\'")
-            assertEquals(result[0][8], "\'99\'")
-
-            result = sql """show column stats date_partition (date_part)"""
-            assertEquals(result.size(), 1)
-            assertEquals(result[0][0], "date_part")
-            assertEquals(result[0][2], "141474.0")
-            assertEquals(result[0][3], "100.0")
-            assertEquals(result[0][4], "0.0")
-            assertEquals(result[0][5], "565896.0")
-            assertEquals(result[0][6], "4.0")
-            assertEquals(result[0][7], "\'2001-10-12\'")
-            assertEquals(result[0][8], "\'2100-10-12\'")
-
-            result = sql """show column stats float_partition (float_part)"""
-            assertEquals(result.size(), 1)
-            assertEquals(result[0][0], "float_part")
-            assertEquals(result[0][2], "141474.0")
-            assertEquals(result[0][3], "100.0")
-            assertEquals(result[0][4], "0.0")
-            assertEquals(result[0][5], "565896.0")
-            assertEquals(result[0][6], "4.0")
-            assertEquals(result[0][7], "296.3103")
-            assertEquals(result[0][8], "32585.627")
-
-            result = sql """show column stats double_partition (double_part)"""
-            assertEquals(result.size(), 1)
-            assertEquals(result[0][0], "double_part")
-            assertEquals(result[0][2], "141474.0")
-            assertEquals(result[0][3], "100.0")
-            assertEquals(result[0][4], "0.0")
-            assertEquals(result[0][5], "1131792.0")
-            assertEquals(result[0][6], "8.0")
-            assertEquals(result[0][7], "115.14474")
-            assertEquals(result[0][8], "32761.14458")
-
-            result = sql """show column stats decimal_partition 
(decimal_part)"""
-            assertEquals(result.size(), 1)
-            assertEquals(result[0][0], "decimal_part")
-            assertEquals(result[0][2], "141474.0")
-            assertEquals(result[0][3], "100.0")
-            assertEquals(result[0][4], "0.0")
-            assertEquals(result[0][5], "1131792.0")
-            assertEquals(result[0][6], "8.0")
-            assertEquals(result[0][7], "243.2868")
-            assertEquals(result[0][8], "32527.1543")
-
-            sql """analyze table 
${catalog_name}.partition_type.decimal_partition (decimal_part) with sync with 
sql"""
-            result = sql """show column stats decimal_partition 
(decimal_part)"""
-            assertEquals(result.size(), 1)
-            assertEquals(result[0][0], "decimal_part")
-            assertEquals(result[0][2], "100000.0")
-            assertEquals(result[0][3], "100.0")
-            assertEquals(result[0][4], "0.0")
-            assertEquals(result[0][5], "800000.0")
-            assertEquals(result[0][6], "8.0")
-            assertEquals(result[0][7], "243.2868")
-            assertEquals(result[0][8], "32527.1543")
-        } finally {
-            sql """set global enable_get_row_count_from_file_list=false"""
-        }
+        sql """set global enable_get_row_count_from_file_list=true"""
+        sql """switch ${catalog_name};"""
+        logger.info("switched to catalog " + catalog_name)
+
+        sql """analyze table ${catalog_name}.partition_type.tinyint_partition 
(tinyint_part) with sync"""
+        sql """analyze table ${catalog_name}.partition_type.smallint_partition 
(smallint_part) with sync"""
+        sql """analyze table ${catalog_name}.partition_type.int_partition 
(int_part) with sync"""
+        sql """analyze table ${catalog_name}.partition_type.bigint_partition 
(bigint_part) with sync"""
+        sql """analyze table ${catalog_name}.partition_type.char_partition 
(char_part) with sync"""
+        sql """analyze table ${catalog_name}.partition_type.varchar_partition 
(varchar_part) with sync"""
+        sql """analyze table ${catalog_name}.partition_type.string_partition 
(string_part) with sync"""
+        sql """analyze table ${catalog_name}.partition_type.date_partition 
(date_part) with sync"""
+        sql """analyze table ${catalog_name}.partition_type.float_partition 
(float_part) with sync"""
+        sql """analyze table ${catalog_name}.partition_type.double_partition 
(double_part) with sync"""
+        sql """analyze table ${catalog_name}.partition_type.decimal_partition 
(decimal_part) with sync"""
+
+        sql """use partition_type;"""
+
+        result = sql """show column stats tinyint_partition (tinyint_part)"""
+        assertEquals(result.size(), 1)
+        assertEquals(result[0][0], "tinyint_part")
+        assertEquals(result[0][1], "tinyint_partition")
+        assertEquals(result[0][2], "100000.0")
+        assertEquals(result[0][3], "100.0")
+        assertEquals(result[0][4], "0.0")
+        assertEquals(result[0][5], "100000.0")
+        assertEquals(result[0][6], "1.0")
+        assertEquals(result[0][7], "1")
+        assertEquals(result[0][8], "100")
+
+        result = sql """show column stats smallint_partition (smallint_part)"""
+        assertEquals(result.size(), 1)
+        assertEquals(result[0][0], "smallint_part")
+        assertEquals(result[0][1], "smallint_partition")
+        assertEquals(result[0][2], "100000.0")
+        assertEquals(result[0][3], "100.0")
+        assertEquals(result[0][4], "0.0")
+        assertEquals(result[0][5], "200000.0")
+        assertEquals(result[0][6], "2.0")
+        assertEquals(result[0][7], "1")
+        assertEquals(result[0][8], "100")
+
+        result = sql """show column stats int_partition (int_part)"""
+        assertEquals(result.size(), 1)
+        assertEquals(result[0][0], "int_part")
+        assertEquals(result[0][1], "int_partition")
+        assertEquals(result[0][2], "100000.0")
+        assertEquals(result[0][3], "100.0")
+        assertEquals(result[0][4], "0.0")
+        assertEquals(result[0][5], "400000.0")
+        assertEquals(result[0][6], "4.0")
+        assertEquals(result[0][7], "1")
+        assertEquals(result[0][8], "100")
+
+        result = sql """show column stats bigint_partition (bigint_part)"""
+        assertEquals(result.size(), 1)
+        assertEquals(result[0][0], "bigint_part")
+        assertEquals(result[0][1], "bigint_partition")
+        assertEquals(result[0][2], "100000.0")
+        assertEquals(result[0][3], "100.0")
+        assertEquals(result[0][4], "0.0")
+        assertEquals(result[0][5], "800000.0")
+        assertEquals(result[0][6], "8.0")
+        assertEquals(result[0][7], "1")
+        assertEquals(result[0][8], "100")
+
+        result = sql """show column stats char_partition (char_part)"""
+        assertEquals(result.size(), 1)
+        assertEquals(result[0][0], "char_part")
+        assertEquals(result[0][1], "char_partition")
+        assertEquals(result[0][2], "100000.0")
+        assertEquals(result[0][3], "100.0")
+        assertEquals(result[0][4], "0.0")
+        assertEquals(result[0][5], "2000000.0")
+        assertEquals(result[0][6], "20.0")
+        assertEquals(result[0][7], "\'1                   \'")
+        assertEquals(result[0][8], "\'99                  \'")
+
+        result = sql """show column stats varchar_partition (varchar_part)"""
+        assertEquals(result.size(), 1)
+        assertEquals(result[0][0], "varchar_part")
+        assertEquals(result[0][1], "varchar_partition")
+        assertEquals(result[0][2], "100000.0")
+        assertEquals(result[0][3], "100.0")
+        assertEquals(result[0][4], "0.0")
+        assertEquals(result[0][5], "192000.0")
+        assertEquals(result[0][6], "1.92")
+        assertEquals(result[0][7], "\'1\'")
+        assertEquals(result[0][8], "\'99\'")
+
+        result = sql """show column stats string_partition (string_part)"""
+        assertEquals(result.size(), 1)
+        assertEquals(result[0][0], "string_part")
+        assertEquals(result[0][1], "string_partition")
+        assertEquals(result[0][2], "100000.0")
+        assertEquals(result[0][3], "100.0")
+        assertEquals(result[0][4], "0.0")
+        assertEquals(result[0][5], "192000.0")
+        assertEquals(result[0][6], "1.92")
+        assertEquals(result[0][7], "\'1\'")
+        assertEquals(result[0][8], "\'99\'")
+
+        result = sql """show column stats date_partition (date_part)"""
+        assertEquals(result.size(), 1)
+        assertEquals(result[0][0], "date_part")
+        assertEquals(result[0][1], "date_partition")
+        assertEquals(result[0][2], "100000.0")
+        assertEquals(result[0][3], "100.0")
+        assertEquals(result[0][4], "0.0")
+        assertEquals(result[0][5], "400000.0")
+        assertEquals(result[0][6], "4.0")
+        assertEquals(result[0][7], "\'2001-10-12\'")
+        assertEquals(result[0][8], "\'2100-10-12\'")
+
+        result = sql """show column stats float_partition (float_part)"""
+        assertEquals(result.size(), 1)
+        assertEquals(result[0][0], "float_part")
+        assertEquals(result[0][1], "float_partition")
+        assertEquals(result[0][2], "100000.0")
+        assertEquals(result[0][3], "100.0")
+        assertEquals(result[0][4], "0.0")
+        assertEquals(result[0][5], "400000.0")
+        assertEquals(result[0][6], "4.0")
+        assertEquals(result[0][7], "296.3103")
+        assertEquals(result[0][8], "32585.627")
+
+        result = sql """show column stats double_partition (double_part)"""
+        assertEquals(result.size(), 1)
+        assertEquals(result[0][0], "double_part")
+        assertEquals(result[0][1], "double_partition")
+        assertEquals(result[0][2], "100000.0")
+        assertEquals(result[0][3], "100.0")
+        assertEquals(result[0][4], "0.0")
+        assertEquals(result[0][5], "800000.0")
+        assertEquals(result[0][6], "8.0")
+        assertEquals(result[0][7], "115.14474")
+        assertEquals(result[0][8], "32761.14458")
+
+        result = sql """show column stats decimal_partition (decimal_part)"""
+        assertEquals(result.size(), 1)
+        assertEquals(result[0][0], "decimal_part")
+        assertEquals(result[0][1], "decimal_partition")
+        assertEquals(result[0][2], "100000.0")
+        assertEquals(result[0][3], "100.0")
+        assertEquals(result[0][4], "0.0")
+        assertEquals(result[0][5], "800000.0")
+        assertEquals(result[0][6], "8.0")
+        assertEquals(result[0][7], "243.2868")
+        assertEquals(result[0][8], "32527.1543")
+
+        sql """analyze table ${catalog_name}.partition_type.decimal_partition 
(decimal_part) with sync with sql"""
+        result = sql """show column stats decimal_partition (decimal_part)"""
+        assertEquals(result.size(), 1)
+        assertEquals(result[0][0], "decimal_part")
+        assertEquals(result[0][1], "decimal_partition")
+        assertEquals(result[0][2], "100000.0")
+        assertEquals(result[0][3], "100.0")
+        assertEquals(result[0][4], "0.0")
+        assertEquals(result[0][5], "800000.0")
+        assertEquals(result[0][6], "8.0")
+        assertEquals(result[0][7], "243.2868")
+        assertEquals(result[0][8], "32527.1543")
         sql """drop catalog ${catalog_name}"""
     }
 }


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org
For additional commands, e-mail: commits-h...@doris.apache.org

Reply via email to