This is an automated email from the ASF dual-hosted git repository. lijibing pushed a commit to branch branch-2.1 in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/branch-2.1 by this push: new 259d28407e4 [improvement](statistics)Enable estimate hive table row count using file size. (#37218) (#37694) 259d28407e4 is described below commit 259d28407e464acf7bfc868842396857d5ed88fd Author: Jibing-Li <64681310+jibing...@users.noreply.github.com> AuthorDate: Fri Jul 12 13:47:27 2024 +0800 [improvement](statistics)Enable estimate hive table row count using file size. (#37218) (#37694) backport: https://github.com/apache/doris/pull/37218 --- .../main/java/org/apache/doris/common/Config.java | 2 +- .../doris/datasource/hive/HMSExternalTable.java | 2 +- .../java/org/apache/doris/qe/GlobalVariable.java | 2 +- .../hive/test_hive_partition_column_analyze.groovy | 325 +++++++++++---------- 4 files changed, 172 insertions(+), 159 deletions(-) diff --git a/fe/fe-common/src/main/java/org/apache/doris/common/Config.java b/fe/fe-common/src/main/java/org/apache/doris/common/Config.java index 1be7b871d68..995813e06e6 100644 --- a/fe/fe-common/src/main/java/org/apache/doris/common/Config.java +++ b/fe/fe-common/src/main/java/org/apache/doris/common/Config.java @@ -2282,7 +2282,7 @@ public class Config extends ConfigBase { @ConfField(mutable = true, masterOnly = false, description = { "Hive行数估算分区采样数", "Sample size for hive row count estimation."}) - public static int hive_stats_partition_sample_size = 3000; + public static int hive_stats_partition_sample_size = 30; @ConfField(mutable = true, masterOnly = true, description = { "启用Hive分桶表", diff --git a/fe/fe-core/src/main/java/org/apache/doris/datasource/hive/HMSExternalTable.java b/fe/fe-core/src/main/java/org/apache/doris/datasource/hive/HMSExternalTable.java index 03eac33ab53..76f2f6c4b39 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/datasource/hive/HMSExternalTable.java +++ b/fe/fe-core/src/main/java/org/apache/doris/datasource/hive/HMSExternalTable.java @@ -875,7 +875,7 @@ public class HMSExternalTable extends ExternalTable implements MTMVRelatedTableI } int totalPartitionSize = partitionValues == null ? 1 : partitionValues.getIdToPartitionItem().size(); - if (samplePartitionSize < totalPartitionSize) { + if (samplePartitionSize != 0 && samplePartitionSize < totalPartitionSize) { totalSize = totalSize * totalPartitionSize / samplePartitionSize; } return totalSize / estimatedRowSize; diff --git a/fe/fe-core/src/main/java/org/apache/doris/qe/GlobalVariable.java b/fe/fe-core/src/main/java/org/apache/doris/qe/GlobalVariable.java index 6eac0c2b815..abe5d452a8f 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/qe/GlobalVariable.java +++ b/fe/fe-core/src/main/java/org/apache/doris/qe/GlobalVariable.java @@ -146,7 +146,7 @@ public final class GlobalVariable { + "Getting file list may be a time-consuming operation. " + "If you don't need to estimate the number of rows in the table " + "or it affects performance, you can disable this feature."}) - public static boolean enable_get_row_count_from_file_list = false; + public static boolean enable_get_row_count_from_file_list = true; @VariableMgr.VarAttr(name = READ_ONLY, flag = VariableMgr.GLOBAL, description = {"仅用于兼容MySQL生态,暂无实际意义", diff --git a/regression-test/suites/external_table_p0/hive/test_hive_partition_column_analyze.groovy b/regression-test/suites/external_table_p0/hive/test_hive_partition_column_analyze.groovy index 7704b68e399..9accfcaa445 100644 --- a/regression-test/suites/external_table_p0/hive/test_hive_partition_column_analyze.groovy +++ b/regression-test/suites/external_table_p0/hive/test_hive_partition_column_analyze.groovy @@ -18,7 +18,13 @@ suite("test_hive_partition_column_analyze", "p0,external,hive,external_docker,external_docker_hive") { String enabled = context.config.otherConfigs.get("enableHiveTest") if (enabled == null || !enabled.equalsIgnoreCase("true")) { - logger.info("diable Hive test.") + logger.info("disable Hive test.") + return; + } + + def enable = sql """show variables like "enable_partition_analyze" """ + if (enable.size() != 1 || enable[0][1].equalsIgnoreCase("false")) { + logger.info("partition analyze disabled. " + enable) return; } @@ -36,161 +42,168 @@ suite("test_hive_partition_column_analyze", "p0,external,hive,external_docker,ex """ logger.info("catalog " + catalog_name + " created") - try { - sql """set global enable_get_row_count_from_file_list=true""" - - sql """switch ${catalog_name};""" - logger.info("switched to catalog " + catalog_name) - - sql """analyze table ${catalog_name}.partition_type.tinyint_partition (tinyint_part) with sync""" - sql """analyze table ${catalog_name}.partition_type.smallint_partition (smallint_part) with sync""" - sql """analyze table ${catalog_name}.partition_type.int_partition (int_part) with sync""" - sql """analyze table ${catalog_name}.partition_type.bigint_partition (bigint_part) with sync""" - sql """analyze table ${catalog_name}.partition_type.char_partition (char_part) with sync""" - sql """analyze table ${catalog_name}.partition_type.varchar_partition (varchar_part) with sync""" - sql """analyze table ${catalog_name}.partition_type.string_partition (string_part) with sync""" - sql """analyze table ${catalog_name}.partition_type.date_partition (date_part) with sync""" - sql """analyze table ${catalog_name}.partition_type.float_partition (float_part) with sync""" - sql """analyze table ${catalog_name}.partition_type.double_partition (double_part) with sync""" - sql """analyze table ${catalog_name}.partition_type.decimal_partition (decimal_part) with sync""" - - sql """use partition_type;""" - - result = sql """show column stats tinyint_partition (tinyint_part)""" - assertEquals(result.size(), 1) - assertEquals(result[0][0], "tinyint_part") - assertEquals(result[0][2], "141474.0") - assertEquals(result[0][3], "100.0") - assertEquals(result[0][4], "0.0") - assertEquals(result[0][5], "141474.0") - assertEquals(result[0][6], "1.0") - assertEquals(result[0][7], "1") - assertEquals(result[0][8], "100") - - result = sql """show column stats smallint_partition (smallint_part)""" - assertEquals(result.size(), 1) - assertEquals(result[0][0], "smallint_part") - assertEquals(result[0][2], "141474.0") - assertEquals(result[0][3], "100.0") - assertEquals(result[0][4], "0.0") - assertEquals(result[0][5], "282948.0") - assertEquals(result[0][6], "2.0") - assertEquals(result[0][7], "1") - assertEquals(result[0][8], "100") - - result = sql """show column stats int_partition (int_part)""" - assertEquals(result.size(), 1) - assertEquals(result[0][0], "int_part") - assertEquals(result[0][2], "141474.0") - assertEquals(result[0][3], "100.0") - assertEquals(result[0][4], "0.0") - assertEquals(result[0][5], "565896.0") - assertEquals(result[0][6], "4.0") - assertEquals(result[0][7], "1") - assertEquals(result[0][8], "100") - - result = sql """show column stats bigint_partition (bigint_part)""" - assertEquals(result.size(), 1) - assertEquals(result[0][0], "bigint_part") - assertEquals(result[0][2], "141474.0") - assertEquals(result[0][3], "100.0") - assertEquals(result[0][4], "0.0") - assertEquals(result[0][5], "1131792.0") - assertEquals(result[0][6], "8.0") - assertEquals(result[0][7], "1") - assertEquals(result[0][8], "100") - - result = sql """show column stats char_partition (char_part)""" - assertEquals(result.size(), 1) - assertEquals(result[0][0], "char_part") - assertEquals(result[0][2], "141474.0") - assertEquals(result[0][3], "101.0") - assertEquals(result[0][4], "0.0") - assertEquals(result[0][5], "2829480.0") - assertEquals(result[0][6], "20.0") - assertEquals(result[0][7], "\'1 \'") - assertEquals(result[0][8], "\'a \'") - - result = sql """show column stats varchar_partition (varchar_part)""" - assertEquals(result.size(), 1) - assertEquals(result[0][0], "varchar_part") - assertEquals(result[0][2], "141474.0") - assertEquals(result[0][3], "100.0") - assertEquals(result[0][4], "0.0") - assertEquals(result[0][5], "271630.0") - assertEquals(result[0][6], "1.9199994345250717") - assertEquals(result[0][7], "\'1\'") - assertEquals(result[0][8], "\'99\'") - - result = sql """show column stats string_partition (string_part)""" - assertEquals(result.size(), 1) - assertEquals(result[0][0], "string_part") - assertEquals(result[0][2], "141474.0") - assertEquals(result[0][3], "100.0") - assertEquals(result[0][4], "0.0") - assertEquals(result[0][5], "271630.0") - assertEquals(result[0][6], "1.9199994345250717") - assertEquals(result[0][7], "\'1\'") - assertEquals(result[0][8], "\'99\'") - - result = sql """show column stats date_partition (date_part)""" - assertEquals(result.size(), 1) - assertEquals(result[0][0], "date_part") - assertEquals(result[0][2], "141474.0") - assertEquals(result[0][3], "100.0") - assertEquals(result[0][4], "0.0") - assertEquals(result[0][5], "565896.0") - assertEquals(result[0][6], "4.0") - assertEquals(result[0][7], "\'2001-10-12\'") - assertEquals(result[0][8], "\'2100-10-12\'") - - result = sql """show column stats float_partition (float_part)""" - assertEquals(result.size(), 1) - assertEquals(result[0][0], "float_part") - assertEquals(result[0][2], "141474.0") - assertEquals(result[0][3], "100.0") - assertEquals(result[0][4], "0.0") - assertEquals(result[0][5], "565896.0") - assertEquals(result[0][6], "4.0") - assertEquals(result[0][7], "296.3103") - assertEquals(result[0][8], "32585.627") - - result = sql """show column stats double_partition (double_part)""" - assertEquals(result.size(), 1) - assertEquals(result[0][0], "double_part") - assertEquals(result[0][2], "141474.0") - assertEquals(result[0][3], "100.0") - assertEquals(result[0][4], "0.0") - assertEquals(result[0][5], "1131792.0") - assertEquals(result[0][6], "8.0") - assertEquals(result[0][7], "115.14474") - assertEquals(result[0][8], "32761.14458") - - result = sql """show column stats decimal_partition (decimal_part)""" - assertEquals(result.size(), 1) - assertEquals(result[0][0], "decimal_part") - assertEquals(result[0][2], "141474.0") - assertEquals(result[0][3], "100.0") - assertEquals(result[0][4], "0.0") - assertEquals(result[0][5], "1131792.0") - assertEquals(result[0][6], "8.0") - assertEquals(result[0][7], "243.2868") - assertEquals(result[0][8], "32527.1543") - - sql """analyze table ${catalog_name}.partition_type.decimal_partition (decimal_part) with sync with sql""" - result = sql """show column stats decimal_partition (decimal_part)""" - assertEquals(result.size(), 1) - assertEquals(result[0][0], "decimal_part") - assertEquals(result[0][2], "100000.0") - assertEquals(result[0][3], "100.0") - assertEquals(result[0][4], "0.0") - assertEquals(result[0][5], "800000.0") - assertEquals(result[0][6], "8.0") - assertEquals(result[0][7], "243.2868") - assertEquals(result[0][8], "32527.1543") - } finally { - sql """set global enable_get_row_count_from_file_list=false""" - } + sql """set global enable_get_row_count_from_file_list=true""" + sql """switch ${catalog_name};""" + logger.info("switched to catalog " + catalog_name) + + sql """analyze table ${catalog_name}.partition_type.tinyint_partition (tinyint_part) with sync""" + sql """analyze table ${catalog_name}.partition_type.smallint_partition (smallint_part) with sync""" + sql """analyze table ${catalog_name}.partition_type.int_partition (int_part) with sync""" + sql """analyze table ${catalog_name}.partition_type.bigint_partition (bigint_part) with sync""" + sql """analyze table ${catalog_name}.partition_type.char_partition (char_part) with sync""" + sql """analyze table ${catalog_name}.partition_type.varchar_partition (varchar_part) with sync""" + sql """analyze table ${catalog_name}.partition_type.string_partition (string_part) with sync""" + sql """analyze table ${catalog_name}.partition_type.date_partition (date_part) with sync""" + sql """analyze table ${catalog_name}.partition_type.float_partition (float_part) with sync""" + sql """analyze table ${catalog_name}.partition_type.double_partition (double_part) with sync""" + sql """analyze table ${catalog_name}.partition_type.decimal_partition (decimal_part) with sync""" + + sql """use partition_type;""" + + result = sql """show column stats tinyint_partition (tinyint_part)""" + assertEquals(result.size(), 1) + assertEquals(result[0][0], "tinyint_part") + assertEquals(result[0][1], "tinyint_partition") + assertEquals(result[0][2], "100000.0") + assertEquals(result[0][3], "100.0") + assertEquals(result[0][4], "0.0") + assertEquals(result[0][5], "100000.0") + assertEquals(result[0][6], "1.0") + assertEquals(result[0][7], "1") + assertEquals(result[0][8], "100") + + result = sql """show column stats smallint_partition (smallint_part)""" + assertEquals(result.size(), 1) + assertEquals(result[0][0], "smallint_part") + assertEquals(result[0][1], "smallint_partition") + assertEquals(result[0][2], "100000.0") + assertEquals(result[0][3], "100.0") + assertEquals(result[0][4], "0.0") + assertEquals(result[0][5], "200000.0") + assertEquals(result[0][6], "2.0") + assertEquals(result[0][7], "1") + assertEquals(result[0][8], "100") + + result = sql """show column stats int_partition (int_part)""" + assertEquals(result.size(), 1) + assertEquals(result[0][0], "int_part") + assertEquals(result[0][1], "int_partition") + assertEquals(result[0][2], "100000.0") + assertEquals(result[0][3], "100.0") + assertEquals(result[0][4], "0.0") + assertEquals(result[0][5], "400000.0") + assertEquals(result[0][6], "4.0") + assertEquals(result[0][7], "1") + assertEquals(result[0][8], "100") + + result = sql """show column stats bigint_partition (bigint_part)""" + assertEquals(result.size(), 1) + assertEquals(result[0][0], "bigint_part") + assertEquals(result[0][1], "bigint_partition") + assertEquals(result[0][2], "100000.0") + assertEquals(result[0][3], "100.0") + assertEquals(result[0][4], "0.0") + assertEquals(result[0][5], "800000.0") + assertEquals(result[0][6], "8.0") + assertEquals(result[0][7], "1") + assertEquals(result[0][8], "100") + + result = sql """show column stats char_partition (char_part)""" + assertEquals(result.size(), 1) + assertEquals(result[0][0], "char_part") + assertEquals(result[0][1], "char_partition") + assertEquals(result[0][2], "100000.0") + assertEquals(result[0][3], "100.0") + assertEquals(result[0][4], "0.0") + assertEquals(result[0][5], "2000000.0") + assertEquals(result[0][6], "20.0") + assertEquals(result[0][7], "\'1 \'") + assertEquals(result[0][8], "\'99 \'") + + result = sql """show column stats varchar_partition (varchar_part)""" + assertEquals(result.size(), 1) + assertEquals(result[0][0], "varchar_part") + assertEquals(result[0][1], "varchar_partition") + assertEquals(result[0][2], "100000.0") + assertEquals(result[0][3], "100.0") + assertEquals(result[0][4], "0.0") + assertEquals(result[0][5], "192000.0") + assertEquals(result[0][6], "1.92") + assertEquals(result[0][7], "\'1\'") + assertEquals(result[0][8], "\'99\'") + + result = sql """show column stats string_partition (string_part)""" + assertEquals(result.size(), 1) + assertEquals(result[0][0], "string_part") + assertEquals(result[0][1], "string_partition") + assertEquals(result[0][2], "100000.0") + assertEquals(result[0][3], "100.0") + assertEquals(result[0][4], "0.0") + assertEquals(result[0][5], "192000.0") + assertEquals(result[0][6], "1.92") + assertEquals(result[0][7], "\'1\'") + assertEquals(result[0][8], "\'99\'") + + result = sql """show column stats date_partition (date_part)""" + assertEquals(result.size(), 1) + assertEquals(result[0][0], "date_part") + assertEquals(result[0][1], "date_partition") + assertEquals(result[0][2], "100000.0") + assertEquals(result[0][3], "100.0") + assertEquals(result[0][4], "0.0") + assertEquals(result[0][5], "400000.0") + assertEquals(result[0][6], "4.0") + assertEquals(result[0][7], "\'2001-10-12\'") + assertEquals(result[0][8], "\'2100-10-12\'") + + result = sql """show column stats float_partition (float_part)""" + assertEquals(result.size(), 1) + assertEquals(result[0][0], "float_part") + assertEquals(result[0][1], "float_partition") + assertEquals(result[0][2], "100000.0") + assertEquals(result[0][3], "100.0") + assertEquals(result[0][4], "0.0") + assertEquals(result[0][5], "400000.0") + assertEquals(result[0][6], "4.0") + assertEquals(result[0][7], "296.3103") + assertEquals(result[0][8], "32585.627") + + result = sql """show column stats double_partition (double_part)""" + assertEquals(result.size(), 1) + assertEquals(result[0][0], "double_part") + assertEquals(result[0][1], "double_partition") + assertEquals(result[0][2], "100000.0") + assertEquals(result[0][3], "100.0") + assertEquals(result[0][4], "0.0") + assertEquals(result[0][5], "800000.0") + assertEquals(result[0][6], "8.0") + assertEquals(result[0][7], "115.14474") + assertEquals(result[0][8], "32761.14458") + + result = sql """show column stats decimal_partition (decimal_part)""" + assertEquals(result.size(), 1) + assertEquals(result[0][0], "decimal_part") + assertEquals(result[0][1], "decimal_partition") + assertEquals(result[0][2], "100000.0") + assertEquals(result[0][3], "100.0") + assertEquals(result[0][4], "0.0") + assertEquals(result[0][5], "800000.0") + assertEquals(result[0][6], "8.0") + assertEquals(result[0][7], "243.2868") + assertEquals(result[0][8], "32527.1543") + + sql """analyze table ${catalog_name}.partition_type.decimal_partition (decimal_part) with sync with sql""" + result = sql """show column stats decimal_partition (decimal_part)""" + assertEquals(result.size(), 1) + assertEquals(result[0][0], "decimal_part") + assertEquals(result[0][1], "decimal_partition") + assertEquals(result[0][2], "100000.0") + assertEquals(result[0][3], "100.0") + assertEquals(result[0][4], "0.0") + assertEquals(result[0][5], "800000.0") + assertEquals(result[0][6], "8.0") + assertEquals(result[0][7], "243.2868") + assertEquals(result[0][8], "32527.1543") sql """drop catalog ${catalog_name}""" } } --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org For additional commands, e-mail: commits-h...@doris.apache.org