This is an automated email from the ASF dual-hosted git repository. kxiao pushed a commit to branch branch-2.0 in repository https://gitbox.apache.org/repos/asf/doris.git
commit 557112e55dd697673868a4188cbe2aa2d29589ce Author: Jibing-Li <64681310+jibing...@users.noreply.github.com> AuthorDate: Thu Oct 19 18:00:55 2023 +0800 [improvement](statistics)Set min max to NULL when collect stats with sample (#25593) 1. To avoid misleading of inaccurate min max stats, set the stats value to NULL while using sample to collect stats. 2. Fix NDV_SAMPLE_TEMPLATE typo, it shouldn't contain row count related contents. --- .../apache/doris/statistics/BaseAnalysisTask.java | 21 ++++- .../apache/doris/statistics/HMSAnalysisTask.java | 7 +- .../apache/doris/statistics/OlapAnalysisTask.java | 6 +- .../hive/test_hive_sample_statistic.groovy | 99 ---------------------- .../hive/test_hive_statistic.groovy | 34 ++++---- .../hive/test_hive_statistic_sample.groovy | 40 ++++----- 6 files changed, 64 insertions(+), 143 deletions(-) diff --git a/fe/fe-core/src/main/java/org/apache/doris/statistics/BaseAnalysisTask.java b/fe/fe-core/src/main/java/org/apache/doris/statistics/BaseAnalysisTask.java index 65791b6edab..ad74266a7c3 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/statistics/BaseAnalysisTask.java +++ b/fe/fe-core/src/main/java/org/apache/doris/statistics/BaseAnalysisTask.java @@ -46,8 +46,7 @@ public abstract class BaseAnalysisTask { protected static final String NDV_MULTIPLY_THRESHOLD = "0.3"; - protected static final String NDV_SAMPLE_TEMPLATE = "ROUND(COUNT(1) * ${scaleFactor}) AS row_count, " - + "case when NDV(`${colName}`)/count('${colName}') < " + protected static final String NDV_SAMPLE_TEMPLATE = "case when NDV(`${colName}`)/count('${colName}') < " + NDV_MULTIPLY_THRESHOLD + " then NDV(`${colName}`) " + "else NDV(`${colName}`) * ${scaleFactor} end AS ndv, " @@ -239,6 +238,24 @@ public abstract class BaseAnalysisTask { return "COUNT(1) * " + column.getType().getSlotSize(); } + // Min value is not accurate while sample, so set it to NULL to avoid optimizer generate bad plan. + protected String getMinFunction() { + if (tableSample == null) { + return "MIN(`${colName}`) "; + } else { + return "NULL "; + } + } + + // Max value is not accurate while sample, so set it to NULL to avoid optimizer generate bad plan. + protected String getMaxFunction() { + if (tableSample == null) { + return "MAX(`${colName}`) "; + } else { + return "NULL "; + } + } + protected TableSample getTableSample() { if (info.forceFull) { return null; diff --git a/fe/fe-core/src/main/java/org/apache/doris/statistics/HMSAnalysisTask.java b/fe/fe-core/src/main/java/org/apache/doris/statistics/HMSAnalysisTask.java index 217eb89383c..188665645c3 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/statistics/HMSAnalysisTask.java +++ b/fe/fe-core/src/main/java/org/apache/doris/statistics/HMSAnalysisTask.java @@ -61,10 +61,11 @@ public class HMSAnalysisTask extends BaseAnalysisTask { + "${idxId} AS idx_id, " + "'${colId}' AS col_id, " + "NULL AS part_id, " + + "ROUND(COUNT(1) * ${scaleFactor}) AS row_count, " + NDV_SAMPLE_TEMPLATE + "ROUND(SUM(CASE WHEN `${colName}` IS NULL THEN 1 ELSE 0 END) * ${scaleFactor}) AS null_count, " - + "MIN(`${colName}`) AS min, " - + "MAX(`${colName}`) AS max, " + + "${minFunction} AS min, " + + "${maxFunction} AS max, " + "${dataSizeFunction} * ${scaleFactor} AS data_size, " + "NOW() " + "FROM `${catalogName}`.`${dbName}`.`${tblName}` ${sampleExpr}"; @@ -177,6 +178,8 @@ public class HMSAnalysisTask extends BaseAnalysisTask { sb.append(ANALYZE_TABLE_TEMPLATE); Map<String, String> params = buildStatsParams("NULL"); params.put("dataSizeFunction", getDataSizeFunction(col)); + params.put("minFunction", getMinFunction()); + params.put("maxFunction", getMaxFunction()); StringSubstitutor stringSubstitutor = new StringSubstitutor(params); String sql = stringSubstitutor.replace(sb.toString()); executeInsertSql(sql); diff --git a/fe/fe-core/src/main/java/org/apache/doris/statistics/OlapAnalysisTask.java b/fe/fe-core/src/main/java/org/apache/doris/statistics/OlapAnalysisTask.java index 7be6d1cc6ee..2df7b9c358d 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/statistics/OlapAnalysisTask.java +++ b/fe/fe-core/src/main/java/org/apache/doris/statistics/OlapAnalysisTask.java @@ -81,11 +81,11 @@ public class OlapAnalysisTask extends BaseAnalysisTask { + "${idxId} AS idx_id, " + "'${colId}' AS col_id, " + "NULL AS part_id, " - + "COUNT(1) * ${scaleFactor} AS row_count, " + + "ROUND(COUNT(1) * ${scaleFactor}) AS row_count, " + NDV_SAMPLE_TEMPLATE + "SUM(CASE WHEN `${colName}` IS NULL THEN 1 ELSE 0 END) * ${scaleFactor} AS null_count, " - + "MIN(`${colName}`) AS min, " - + "MAX(`${colName}`) AS max, " + + "NULL AS min, " + + "NULL AS max, " + "${dataSizeFunction} * ${scaleFactor} AS data_size, " + "NOW() " + "FROM `${dbName}`.`${tblName}`" diff --git a/regression-test/suites/external_table_p2/hive/test_hive_sample_statistic.groovy b/regression-test/suites/external_table_p2/hive/test_hive_sample_statistic.groovy deleted file mode 100644 index c2a21e3994b..00000000000 --- a/regression-test/suites/external_table_p2/hive/test_hive_sample_statistic.groovy +++ /dev/null @@ -1,99 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -suite("test_hive_sample_statistic", "p2,external,hive,external_remote,external_remote_hive") { - String enabled = context.config.otherConfigs.get("enableExternalHiveTest") - if (enabled != null && enabled.equalsIgnoreCase("true")) { - String extHiveHmsHost = context.config.otherConfigs.get("extHiveHmsHost") - String extHiveHmsPort = context.config.otherConfigs.get("extHiveHmsPort") - String catalog_name = "test_hive_sample_statistic" - sql """drop catalog if exists ${catalog_name};""" - sql """ - create catalog if not exists ${catalog_name} properties ( - 'type'='hms', - 'hadoop.username' = 'hadoop', - 'hive.metastore.uris' = 'thrift://${extHiveHmsHost}:${extHiveHmsPort}' - ); - """ - logger.info("catalog " + catalog_name + " created") - - sql """use ${catalog_name}.tpch_1000_parquet""" - sql """analyze table part with sample percent 10 with sync;""" - - def result = sql """show table stats part""" - assertTrue(result.size() == 1) - assertTrue(Long.parseLong(result[0][2]) >= 200000000) - assertTrue(Long.parseLong(result[0][2]) < 220000000) - - def ctlId - result = sql """show proc '/catalogs'""" - - for (int i = 0; i < result.size(); i++) { - if (result[i][1] == catalog_name) { - ctlId = result[i][0] - } - } - - result = sql """select count from internal.__internal_schema.column_statistics where catalog_id='$ctlId' and col_id='p_partkey'""" - assertTrue(result.size() == 1) - assertTrue(result[0][0] >= 200000000) - assertTrue(result[0][0] < 220000000) - - result = sql """select count from internal.__internal_schema.column_statistics where catalog_id='$ctlId' and col_id='p_name'""" - assertTrue(result.size() == 1) - assertTrue(result[0][0] >= 200000000) - assertTrue(result[0][0] < 220000000) - - result = sql """select count from internal.__internal_schema.column_statistics where catalog_id='$ctlId' and col_id='p_mfgr'""" - assertTrue(result.size() == 1) - assertTrue(result[0][0] >= 200000000) - assertTrue(result[0][0] < 220000000) - - result = sql """select count from internal.__internal_schema.column_statistics where catalog_id='$ctlId' and col_id='p_brand'""" - assertTrue(result.size() == 1) - assertTrue(result[0][0] >= 200000000) - assertTrue(result[0][0] < 220000000) - - result = sql """select count from internal.__internal_schema.column_statistics where catalog_id='$ctlId' and col_id='p_type'""" - assertTrue(result.size() == 1) - assertTrue(result[0][0] >= 200000000) - assertTrue(result[0][0] < 220000000) - - result = sql """select count from internal.__internal_schema.column_statistics where catalog_id='$ctlId' and col_id='p_size'""" - assertTrue(result.size() == 1) - assertTrue(result[0][0] >= 200000000) - assertTrue(result[0][0] < 220000000) - - result = sql """select count from internal.__internal_schema.column_statistics where catalog_id='$ctlId' and col_id='p_container'""" - assertTrue(result.size() == 1) - assertTrue(result[0][0] >= 200000000) - assertTrue(result[0][0] < 220000000) - - result = sql """select count from internal.__internal_schema.column_statistics where catalog_id='$ctlId' and col_id='p_retailprice'""" - assertTrue(result.size() == 1) - assertTrue(result[0][0] >= 200000000) - assertTrue(result[0][0] < 220000000) - - result = sql """select count from internal.__internal_schema.column_statistics where catalog_id='$ctlId' and col_id='p_comment'""" - assertTrue(result.size() == 1) - assertTrue(result[0][0] >= 200000000) - assertTrue(result[0][0] < 220000000) - - sql """drop catalog ${catalog_name}"""; - } -} - diff --git a/regression-test/suites/external_table_p2/hive/test_hive_statistic.groovy b/regression-test/suites/external_table_p2/hive/test_hive_statistic.groovy index 49142b12e36..38d6fdf7082 100644 --- a/regression-test/suites/external_table_p2/hive/test_hive_statistic.groovy +++ b/regression-test/suites/external_table_p2/hive/test_hive_statistic.groovy @@ -44,7 +44,7 @@ suite("test_hive_statistic", "p2") { assertTrue(result[0][1] == "100.0") assertTrue(result[0][2] == "46.0") assertTrue(result[0][3] == "0.0") - assertTrue(result[0][4] == "404.0") + assertTrue(result[0][4] == "400.0") assertTrue(result[0][5] == "4.0") assertTrue(result[0][6] == "1") assertTrue(result[0][7] == "50") @@ -55,7 +55,7 @@ suite("test_hive_statistic", "p2") { assertTrue(result[0][1] == "100.0") assertTrue(result[0][2] == "26.0") assertTrue(result[0][3] == "0.0") - assertTrue(result[0][4] == "404.0") + assertTrue(result[0][4] == "400.0") assertTrue(result[0][5] == "4.0") assertTrue(result[0][6] == "1") assertTrue(result[0][7] == "98") @@ -66,7 +66,7 @@ suite("test_hive_statistic", "p2") { assertTrue(result[0][1] == "100.0") assertTrue(result[0][2] == "7.0") assertTrue(result[0][3] == "0.0") - assertTrue(result[0][4] == "404.0") + assertTrue(result[0][4] == "400.0") assertTrue(result[0][5] == "4.0") assertTrue(result[0][6] == "1") assertTrue(result[0][7] == "7") @@ -77,7 +77,7 @@ suite("test_hive_statistic", "p2") { assertTrue(result[0][1] == "100.0") assertTrue(result[0][2] == "26.0") assertTrue(result[0][3] == "0.0") - assertTrue(result[0][4] == "404.0") + assertTrue(result[0][4] == "400.0") assertTrue(result[0][5] == "4.0") assertTrue(result[0][6] == "67423") assertTrue(result[0][7] == "2735521") @@ -88,7 +88,7 @@ suite("test_hive_statistic", "p2") { assertTrue(result[0][1] == "100.0") assertTrue(result[0][2] == "100.0") assertTrue(result[0][3] == "0.0") - assertTrue(result[0][4] == "404.0") + assertTrue(result[0][4] == "400.0") assertTrue(result[0][5] == "4.0") assertTrue(result[0][6] == "2250") assertTrue(result[0][7] == "989601") @@ -99,7 +99,7 @@ suite("test_hive_statistic", "p2") { assertTrue(result[0][1] == "100.0") assertTrue(result[0][2] == "100.0") assertTrue(result[0][3] == "0.0") - assertTrue(result[0][4] == "404.0") + assertTrue(result[0][4] == "400.0") assertTrue(result[0][5] == "4.0") assertTrue(result[0][6] == "4167") assertTrue(result[0][7] == "195845") @@ -110,7 +110,7 @@ suite("test_hive_statistic", "p2") { assertTrue(result[0][1] == "100.0") assertTrue(result[0][2] == "26.0") assertTrue(result[0][3] == "0.0") - assertTrue(result[0][4] == "404.0") + assertTrue(result[0][4] == "400.0") assertTrue(result[0][5] == "4.0") assertTrue(result[0][6] == "19920221") assertTrue(result[0][7] == "19980721") @@ -121,7 +121,7 @@ suite("test_hive_statistic", "p2") { assertTrue(result[0][1] == "100.0") assertTrue(result[0][2] == "5.0") assertTrue(result[0][3] == "0.0") - assertTrue(result[0][4] == "888.8000000000001") + assertTrue(result[0][4] == "880.0") assertTrue(result[0][5] == "8.8") assertTrue(result[0][6] == "'1-URGENT'") assertTrue(result[0][7] == "'5-LOW'") @@ -132,7 +132,7 @@ suite("test_hive_statistic", "p2") { assertTrue(result[0][1] == "100.0") assertTrue(result[0][2] == "1.0") assertTrue(result[0][3] == "0.0") - assertTrue(result[0][4] == "404.0") + assertTrue(result[0][4] == "400.0") assertTrue(result[0][5] == "4.0") assertTrue(result[0][6] == "0") assertTrue(result[0][7] == "0") @@ -143,7 +143,7 @@ suite("test_hive_statistic", "p2") { assertTrue(result[0][1] == "100.0") assertTrue(result[0][2] == "100.0") assertTrue(result[0][3] == "0.0") - assertTrue(result[0][4] == "404.0") + assertTrue(result[0][4] == "400.0") assertTrue(result[0][5] == "4.0") assertTrue(result[0][6] == "104300") assertTrue(result[0][7] == "9066094") @@ -154,7 +154,7 @@ suite("test_hive_statistic", "p2") { assertTrue(result[0][1] == "100.0") assertTrue(result[0][2] == "26.0") assertTrue(result[0][3] == "0.0") - assertTrue(result[0][4] == "404.0") + assertTrue(result[0][4] == "400.0") assertTrue(result[0][5] == "4.0") assertTrue(result[0][6] == "3428256") assertTrue(result[0][7] == "36771805") @@ -165,7 +165,7 @@ suite("test_hive_statistic", "p2") { assertTrue(result[0][1] == "100.0") assertTrue(result[0][2] == "11.0") assertTrue(result[0][3] == "0.0") - assertTrue(result[0][4] == "404.0") + assertTrue(result[0][4] == "400.0") assertTrue(result[0][5] == "4.0") assertTrue(result[0][6] == "0") assertTrue(result[0][7] == "10") @@ -176,7 +176,7 @@ suite("test_hive_statistic", "p2") { assertTrue(result[0][1] == "100.0") assertTrue(result[0][2] == "100.0") assertTrue(result[0][3] == "0.0") - assertTrue(result[0][4] == "404.0") + assertTrue(result[0][4] == "400.0") assertTrue(result[0][5] == "4.0") assertTrue(result[0][6] == "101171") assertTrue(result[0][7] == "8703450") @@ -187,7 +187,7 @@ suite("test_hive_statistic", "p2") { assertTrue(result[0][1] == "100.0") assertTrue(result[0][2] == "100.0") assertTrue(result[0][3] == "0.0") - assertTrue(result[0][4] == "404.0") + assertTrue(result[0][4] == "400.0") assertTrue(result[0][5] == "4.0") assertTrue(result[0][6] == "58023") assertTrue(result[0][7] == "121374") @@ -198,7 +198,7 @@ suite("test_hive_statistic", "p2") { assertTrue(result[0][1] == "100.0") assertTrue(result[0][2] == "9.0") assertTrue(result[0][3] == "0.0") - assertTrue(result[0][4] == "404.0") + assertTrue(result[0][4] == "400.0") assertTrue(result[0][5] == "4.0") assertTrue(result[0][6] == "0") assertTrue(result[0][7] == "8") @@ -209,7 +209,7 @@ suite("test_hive_statistic", "p2") { assertTrue(result[0][1] == "100.0") assertTrue(result[0][2] == "95.0") assertTrue(result[0][3] == "0.0") - assertTrue(result[0][4] == "404.0") + assertTrue(result[0][4] == "400.0") assertTrue(result[0][5] == "4.0") assertTrue(result[0][6] == "19920515") assertTrue(result[0][7] == "19981016") @@ -220,7 +220,7 @@ suite("test_hive_statistic", "p2") { assertTrue(result[0][1] == "100.0") assertTrue(result[0][2] == "7.0") assertTrue(result[0][3] == "0.0") - assertTrue(result[0][4] == "425.21") + assertTrue(result[0][4] == "421.0") assertTrue(result[0][5] == "4.21") assertTrue(result[0][6] == "'AIR'") assertTrue(result[0][7] == "'TRUCK'") diff --git a/regression-test/suites/external_table_p2/hive/test_hive_statistic_sample.groovy b/regression-test/suites/external_table_p2/hive/test_hive_statistic_sample.groovy index 960bec31df5..212ea2a3f9f 100644 --- a/regression-test/suites/external_table_p2/hive/test_hive_statistic_sample.groovy +++ b/regression-test/suites/external_table_p2/hive/test_hive_statistic_sample.groovy @@ -42,8 +42,8 @@ suite("test_hive_statistic_sample", "p2,external,hive,external_remote,external_r assertTrue(result[0][3] == "0.0") assertTrue(result[0][4] == "20.0") assertTrue(result[0][5] == "4.0") - assertTrue(result[0][6] == "0") - assertTrue(result[0][7] == "4") + assertTrue(result[0][6] == "N/A") + assertTrue(result[0][7] == "N/A") result = sql """show column stats region (r_name)""" assertTrue(result.size() == 1) @@ -53,8 +53,8 @@ suite("test_hive_statistic_sample", "p2,external,hive,external_remote,external_r assertTrue(result[0][3] == "0.0") assertTrue(result[0][4] == "34.0") assertTrue(result[0][5] == "6.8") - assertTrue(result[0][6] == "\'AFRICA\'") - assertTrue(result[0][7] == "\'MIDDLE EAST\'") + assertTrue(result[0][6] == "N/A") + assertTrue(result[0][7] == "N/A") result = sql """show column stats region (r_comment)""" assertTrue(result.size() == 1) @@ -64,8 +64,8 @@ suite("test_hive_statistic_sample", "p2,external,hive,external_remote,external_r assertTrue(result[0][3] == "0.0") assertTrue(result[0][4] == "330.0") assertTrue(result[0][5] == "66.0") - assertTrue(result[0][6] == "\'ges. thinly even pinto beans ca\'") - assertTrue(result[0][7] == "\'uickly special accounts cajole carefully blithely close requests. carefully final asymptotes haggle furiousl\'") + assertTrue(result[0][6] == "N/A") + assertTrue(result[0][7] == "N/A") result = sql """show column stats supplier (s_suppkey)""" assertTrue(result.size() == 1) @@ -75,8 +75,8 @@ suite("test_hive_statistic_sample", "p2,external,hive,external_remote,external_r assertTrue(result[0][3] == "0.0") assertTrue(result[0][4] == "3.9995194E7") assertTrue(result[0][5] == "3.9999997999759773") - assertTrue(result[0][6] == "1885331") - assertTrue(result[0][7] == "9395153") + assertTrue(result[0][6] == "N/A") + assertTrue(result[0][7] == "N/A") result = sql """show column stats supplier (s_name)""" assertTrue(result.size() == 1) @@ -86,8 +86,8 @@ suite("test_hive_statistic_sample", "p2,external,hive,external_remote,external_r assertTrue(result[0][3] == "0.0") assertTrue(result[0][4] == "1.79978374E8") assertTrue(result[0][5] == "17.999999199903908") - assertTrue(result[0][6] == "\'Supplier#001885331\'") - assertTrue(result[0][7] == "\'Supplier#009395153\'") + assertTrue(result[0][6] == "N/A") + assertTrue(result[0][7] == "N/A") result = sql """show column stats supplier (s_address)""" assertTrue(result.size() == 1) @@ -97,8 +97,8 @@ suite("test_hive_statistic_sample", "p2,external,hive,external_remote,external_r assertTrue(result[0][3] == "0.0") assertTrue(result[0][4] == "2.50070604E8") assertTrue(result[0][5] == "25.010064108699456") - assertTrue(result[0][6] == "\' E,WAW2ZEx\'") - assertTrue(result[0][7] == "\'zzzw X3bpxu,OCpzgv6BdyMVMKzaB1DbH\'") + assertTrue(result[0][6] == "N/A") + assertTrue(result[0][7] == "N/A") result = sql """show column stats supplier (s_nationkey)""" assertTrue(result.size() == 1) @@ -108,8 +108,8 @@ suite("test_hive_statistic_sample", "p2,external,hive,external_remote,external_r assertTrue(result[0][3] == "0.0") assertTrue(result[0][4] == "3.9995194E7") assertTrue(result[0][5] == "3.9999997999759773") - assertTrue(result[0][6] == "0") - assertTrue(result[0][7] == "24") + assertTrue(result[0][6] == "N/A") + assertTrue(result[0][7] == "N/A") result = sql """show column stats supplier (s_phone)""" assertTrue(result.size() == 1) @@ -119,8 +119,8 @@ suite("test_hive_statistic_sample", "p2,external,hive,external_remote,external_r assertTrue(result[0][3] == "0.0") assertTrue(result[0][4] == "1.49981978E8") assertTrue(result[0][5] == "14.99999929991592") - assertTrue(result[0][6] == "\'10-100-128-4513\'") - assertTrue(result[0][7] == "\'34-999-967-7296\'") + assertTrue(result[0][6] == "N/A") + assertTrue(result[0][7] == "N/A") result = sql """show column stats supplier (s_acctbal)""" assertTrue(result.size() == 1) @@ -130,8 +130,8 @@ suite("test_hive_statistic_sample", "p2,external,hive,external_remote,external_r assertTrue(result[0][3] == "0.0") assertTrue(result[0][4] == "7.9990388E7") assertTrue(result[0][5] == "7.999999599951955") - assertTrue(result[0][6] == "-999.99") - assertTrue(result[0][7] == "9999.99") + assertTrue(result[0][6] == "N/A") + assertTrue(result[0][7] == "N/A") result = sql """show column stats supplier (s_comment)""" assertTrue(result.size() == 1) @@ -141,8 +141,8 @@ suite("test_hive_statistic_sample", "p2,external,hive,external_remote,external_r assertTrue(result[0][3] == "0.0") assertTrue(result[0][4] == "6.24883849E8") assertTrue(result[0][5] == "62.49589065646784") - assertTrue(result[0][6] == "\' Customer across the pinto beans. pinRecommends\'") - assertTrue(result[0][7] == "\'zzle? express, regular foxes haggle final ac\'") + assertTrue(result[0][6] == "N/A") + assertTrue(result[0][7] == "N/A") sql """drop catalog ${catalog_name}""" } --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org For additional commands, e-mail: commits-h...@doris.apache.org