This is an automated email from the ASF dual-hosted git repository. yiguolei pushed a commit to branch branch-2.1 in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/branch-2.1 by this push: new 1a1649376f6 branch-2.1:[improvement](statistics)Eliminate null values while sample analyzing ndv. (#50574) (#51648) 1a1649376f6 is described below commit 1a1649376f650c23a3c95d483c95507fe8aa3263 Author: James <lijib...@selectdb.com> AuthorDate: Fri Jun 13 15:38:11 2025 +0800 branch-2.1:[improvement](statistics)Eliminate null values while sample analyzing ndv. (#50574) (#51648) backport: https://github.com/apache/doris/pull/50574 --- .../doris/nereids/stats/StatsCalculator.java | 11 ++- .../apache/doris/statistics/BaseAnalysisTask.java | 10 +- .../org/apache/doris/statistics/ColStatsData.java | 14 ++- .../doris/statistics/BaseAnalysisTaskTest.java | 46 ++++++++- .../apache/doris/statistics/ColStatsDataTest.java | 2 +- .../doris/statistics/OlapAnalysisTaskTest.java | 4 +- .../hive/test_hive_statistics_all_type_p0.groovy | 4 +- .../suites/statistics/test_analyze_all_null.groovy | 108 +++++++++++++++++++++ 8 files changed, 177 insertions(+), 22 deletions(-) diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/stats/StatsCalculator.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/stats/StatsCalculator.java index d2dc508e5b4..0a1ae85adfe 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/nereids/stats/StatsCalculator.java +++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/stats/StatsCalculator.java @@ -885,8 +885,15 @@ public class StatsCalculator extends DefaultPlanVisitor<Statistics, Void> { return ColumnStatistic.UNKNOWN; } } else { - return Env.getCurrentEnv().getStatisticsCache().getColumnStatistics( - catalogId, dbId, table.getId(), idxId, colName); + ColumnStatistic columnStatistics = Env.getCurrentEnv().getStatisticsCache().getColumnStatistics( + catalogId, dbId, table.getId(), idxId, colName); + if (!columnStatistics.isUnKnown + && columnStatistics.ndv == 0 + && (columnStatistics.minExpr != null || columnStatistics.maxExpr != null) + && columnStatistics.numNulls == columnStatistics.count) { + return ColumnStatistic.UNKNOWN; + } + return columnStatistics; } } diff --git a/fe/fe-core/src/main/java/org/apache/doris/statistics/BaseAnalysisTask.java b/fe/fe-core/src/main/java/org/apache/doris/statistics/BaseAnalysisTask.java index 1513dd2d782..926b7499f05 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/statistics/BaseAnalysisTask.java +++ b/fe/fe-core/src/main/java/org/apache/doris/statistics/BaseAnalysisTask.java @@ -237,16 +237,14 @@ public abstract class BaseAnalysisTask { } protected String getNdvFunction(String totalRows) { - String sampleRows = "SUM(`t1`.`count`)"; - String onceCount = "SUM(IF(`t1`.`count` = 1, 1, 0))"; - String countDistinct = "COUNT(1)"; + String n = "SUM(`t1`.`count`)"; // sample rows + String f1 = "SUM(IF(`t1`.`count` = 1 and `t1`.`column_key` is not null, 1, 0))"; + String d = "COUNT(`t1`.`column_key`)"; // sample ndv // DUJ1 estimator: n*d / (n - f1 + f1*n/N) // f1 is the count of element that appears only once in the sample. // (https://github.com/postgres/postgres/blob/master/src/backend/commands/analyze.c) // (http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.93.8637&rep=rep1&type=pdf) - // sample_row * count_distinct / ( sample_row - once_count + once_count * sample_row / total_row) - return MessageFormat.format("{0} * {1} / ({0} - {2} + {2} * {0} / {3})", sampleRows, - countDistinct, onceCount, totalRows); + return MessageFormat.format("{0} * {1} / ({0} - {2} + {2} * {0} / {3})", n, d, f1, totalRows); } // Max value is not accurate while sample, so set it to NULL to avoid optimizer generate bad plan. diff --git a/fe/fe-core/src/main/java/org/apache/doris/statistics/ColStatsData.java b/fe/fe-core/src/main/java/org/apache/doris/statistics/ColStatsData.java index cc79f779f6e..5d4158db3fc 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/statistics/ColStatsData.java +++ b/fe/fe-core/src/main/java/org/apache/doris/statistics/ColStatsData.java @@ -189,15 +189,19 @@ public class ColStatsData { public boolean isValid() { if (ndv > 10 * count) { - LOG.debug("Ndv {} is much larger than count {}", ndv, count); + String message = String.format("ColStatsData ndv too large. %s", toSQL(true)); + LOG.warn(message); return false; } - if (ndv == 0 && (!isNull(minLit) || !isNull(maxLit))) { - LOG.debug("Ndv is 0 but min or max exists"); + if (ndv == 0 && (!isNull(minLit) || !isNull(maxLit)) && nullCount != count) { + String message = String.format("ColStatsData ndv 0 but min/max is not null and nullCount != count. %s", + toSQL(true)); + LOG.warn(message); return false; } - if (count > 0 && ndv == 0 && isNull(minLit) && isNull(maxLit) && (nullCount == 0 || count > nullCount * 10)) { - LOG.debug("count {} not 0, ndv is 0, min and max are all null, null count {} is too small", count, count); + if (count > 0 && ndv == 0 && isNull(minLit) && isNull(maxLit) && (count > nullCount * 10)) { + LOG.warn("count {} not 0, ndv is 0, min and max are all null, null count {} is too small", + count, nullCount); return false; } return true; diff --git a/fe/fe-core/src/test/java/org/apache/doris/statistics/BaseAnalysisTaskTest.java b/fe/fe-core/src/test/java/org/apache/doris/statistics/BaseAnalysisTaskTest.java index dfa0d9fca9e..8405216a271 100644 --- a/fe/fe-core/src/test/java/org/apache/doris/statistics/BaseAnalysisTaskTest.java +++ b/fe/fe-core/src/test/java/org/apache/doris/statistics/BaseAnalysisTaskTest.java @@ -62,13 +62,12 @@ public class BaseAnalysisTaskTest { Assertions.assertEquals("NULL", maxFunction); String ndvFunction = olapAnalysisTask.getNdvFunction(String.valueOf(100)); - Assertions.assertEquals("SUM(`t1`.`count`) * COUNT(1) / (SUM(`t1`.`count`) - SUM(IF(`t1`.`count` = 1, 1, 0)) " - + "+ SUM(IF(`t1`.`count` = 1, 1, 0)) * SUM(`t1`.`count`) / 100)", ndvFunction); + Assertions.assertEquals("SUM(`t1`.`count`) * COUNT(`t1`.`column_key`) / (SUM(`t1`.`count`) - SUM(IF(`t1`.`count` = 1 and `t1`.`column_key` is not null, 1, 0)) + SUM(IF(`t1`.`count` = 1 and `t1`.`column_key` is not null, 1, 0)) * SUM(`t1`.`count`) / 100)", ndvFunction); System.out.println(ndvFunction); } @Test - public void testInvalidColStats() { + public void testNdvTooLarge() { List<String> values = Lists.newArrayList(); values.add("id"); values.add("10000"); @@ -101,10 +100,49 @@ public class BaseAnalysisTaskTest { } catch (Exception e) { Assertions.assertEquals(e.getMessage(), "ColStatsData is invalid, skip analyzing. " - + "('id',10000,20000,30000,0,'col',null,100,1100,300,'min','max',400,'500')"); + + "('id',10000,20000,30000,0,'col',null,100,1100,300,'min','max',400,'500')"); return; } Assertions.fail(); } + @Test + public void testNdv0MinMaxExistsNullNotEqualCount() { + List<String> values = Lists.newArrayList(); + values.add("id"); + values.add("10000"); + values.add("20000"); + values.add("30000"); + values.add("0"); + values.add("col"); + values.add(null); + values.add("500"); // count + values.add("0"); // ndv + values.add("300"); // null + values.add("min"); + values.add("max"); + values.add("400"); + values.add("500"); + ResultRow row = new ResultRow(values); + List<ResultRow> result = Lists.newArrayList(); + result.add(row); + + new MockUp<StmtExecutor>() { + @Mock + public List<ResultRow> executeInternalQuery() { + return result; + } + }; + BaseAnalysisTask task = new OlapAnalysisTask(); + task.info = new AnalysisInfoBuilder().setJobType(JobType.MANUAL).build(); + try { + task.runQuery("test"); + } catch (Exception e) { + Assertions.assertEquals(e.getMessage(), + "ColStatsData is invalid, skip analyzing. " + + "('id',10000,20000,30000,0,'col',null,500,0,300,'min','max',400,'500')"); + return; + } + Assertions.fail(); + } } diff --git a/fe/fe-core/src/test/java/org/apache/doris/statistics/ColStatsDataTest.java b/fe/fe-core/src/test/java/org/apache/doris/statistics/ColStatsDataTest.java index a7c84370153..b193edb2049 100644 --- a/fe/fe-core/src/test/java/org/apache/doris/statistics/ColStatsDataTest.java +++ b/fe/fe-core/src/test/java/org/apache/doris/statistics/ColStatsDataTest.java @@ -215,7 +215,7 @@ public class ColStatsDataTest { data = new ColStatsData(row); Assertions.assertFalse(data.isValid()); - // Set max to null, min/max is not null + // Set max to null, min/max are all null values.set(11, null); row = new ResultRow(values); data = new ColStatsData(row); diff --git a/fe/fe-core/src/test/java/org/apache/doris/statistics/OlapAnalysisTaskTest.java b/fe/fe-core/src/test/java/org/apache/doris/statistics/OlapAnalysisTaskTest.java index 96d76a61908..3e0e309a488 100644 --- a/fe/fe-core/src/test/java/org/apache/doris/statistics/OlapAnalysisTaskTest.java +++ b/fe/fe-core/src/test/java/org/apache/doris/statistics/OlapAnalysisTaskTest.java @@ -370,7 +370,7 @@ public class OlapAnalysisTaskTest { Assertions.assertTrue(task.scanFullTable()); Assertions.assertEquals("1.0", params.get("scaleFactor")); Assertions.assertEquals("", params.get("sampleHints")); - Assertions.assertEquals("SUM(`t1`.`count`) * COUNT(1) / (SUM(`t1`.`count`) - SUM(IF(`t1`.`count` = 1, 1, 0)) + SUM(IF(`t1`.`count` = 1, 1, 0)) * SUM(`t1`.`count`) / 10)", params.get("ndvFunction")); + Assertions.assertEquals("SUM(`t1`.`count`) * COUNT(`t1`.`column_key`) / (SUM(`t1`.`count`) - SUM(IF(`t1`.`count` = 1 and `t1`.`column_key` is not null, 1, 0)) + SUM(IF(`t1`.`count` = 1 and `t1`.`column_key` is not null, 1, 0)) * SUM(`t1`.`count`) / 10)", params.get("ndvFunction")); params.clear(); task = new OlapAnalysisTask(); @@ -378,7 +378,7 @@ public class OlapAnalysisTaskTest { task.getSampleParams(params, 1000); Assertions.assertEquals("10.0", params.get("scaleFactor")); Assertions.assertEquals("TABLET(1, 2)", params.get("sampleHints")); - Assertions.assertEquals("SUM(`t1`.`count`) * COUNT(1) / (SUM(`t1`.`count`) - SUM(IF(`t1`.`count` = 1, 1, 0)) + SUM(IF(`t1`.`count` = 1, 1, 0)) * SUM(`t1`.`count`) / 1000)", params.get("ndvFunction")); + Assertions.assertEquals("SUM(`t1`.`count`) * COUNT(`t1`.`column_key`) / (SUM(`t1`.`count`) - SUM(IF(`t1`.`count` = 1 and `t1`.`column_key` is not null, 1, 0)) + SUM(IF(`t1`.`count` = 1 and `t1`.`column_key` is not null, 1, 0)) * SUM(`t1`.`count`) / 1000)", params.get("ndvFunction")); Assertions.assertEquals("SUM(t1.count) * 4", params.get("dataSizeFunction")); Assertions.assertEquals("`${colName}`", params.get("subStringColName")); params.clear(); diff --git a/regression-test/suites/external_table_p0/hive/test_hive_statistics_all_type_p0.groovy b/regression-test/suites/external_table_p0/hive/test_hive_statistics_all_type_p0.groovy index 7d85f288bd7..b111247d9e7 100644 --- a/regression-test/suites/external_table_p0/hive/test_hive_statistics_all_type_p0.groovy +++ b/regression-test/suites/external_table_p0/hive/test_hive_statistics_all_type_p0.groovy @@ -41,14 +41,14 @@ suite("test_hive_statistics_all_type_p0", "all_types,p0,external,hive,external_d result = sql """show column stats orc_all_types (int_col);""" assertEquals("int_col", result[0][0]) assertEquals("3600.0", result[0][2]) - assertEquals("3240.0", result[0][3]) + assertEquals("3239.0", result[0][3]) assertEquals("361.0", result[0][4]) assertEquals("14400.0", result[0][5]) result = sql """show column stats orc_all_types (string_col);""" assertEquals("string_col", result[0][0]) assertEquals("3600.0", result[0][2]) - assertEquals("3254.0", result[0][3]) + assertEquals("3253.0", result[0][3]) assertEquals("347.0", result[0][4]) assertEquals("453634.0", result[0][5]) diff --git a/regression-test/suites/statistics/test_analyze_all_null.groovy b/regression-test/suites/statistics/test_analyze_all_null.groovy new file mode 100644 index 00000000000..5defabb67eb --- /dev/null +++ b/regression-test/suites/statistics/test_analyze_all_null.groovy @@ -0,0 +1,108 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +suite("test_analyze_all_null") { + + def wait_row_count_reported = { db, table, row, column, expected -> + def result = sql """show frontends;""" + logger.info("show frontends result origin: " + result) + def host + def port + for (int i = 0; i < result.size(); i++) { + if (result[i][8] == "true") { + host = result[i][1] + port = result[i][4] + } + } + def tokens = context.config.jdbcUrl.split('/') + def url=tokens[0] + "//" + host + ":" + port + logger.info("Master url is " + url) + connect(context.config.jdbcUser, context.config.jdbcPassword, url) { + sql """use ${db}""" + result = sql """show frontends;""" + logger.info("show frontends result master: " + result) + for (int i = 0; i < 120; i++) { + Thread.sleep(5000) + result = sql """SHOW DATA FROM ${table};""" + logger.info("result " + result) + if (result[row][column] == expected) { + return; + } + } + throw new Exception("Row count report timeout.") + } + + } + + sql """drop database if exists test_analyze_all_null""" + sql """create database test_analyze_all_null""" + sql """use test_analyze_all_null""" + sql """set global enable_auto_analyze=false""" + + sql """CREATE TABLE allnull ( + key1 int NULL, + value1 varchar(25) NULL + )ENGINE=OLAP + DUPLICATE KEY(`key1`) + COMMENT "OLAP" + DISTRIBUTED BY HASH(`key1`) BUCKETS 2 + PROPERTIES ( + "replication_num" = "1" + ) + """ + sql """insert into allnull select null, null from numbers("number"="10000000")""" + wait_row_count_reported("test_analyze_all_null", "allnull", 0, 4, "10000000") + sql """analyze table allnull with sample rows 4000000 with sync""" + + def result = sql """show column stats allnull(key1)""" + assertEquals(1, result.size()) + assertEquals("1.0E7", result[0][2]) + assertEquals("0.0", result[0][3]) + result = sql """show column stats allnull(value1)""" + assertEquals(1, result.size()) + assertEquals("1.0E7", result[0][2]) + assertEquals("0.0", result[0][3]) + + sql """CREATE TABLE invalidTest ( + col1 int NULL, + col2 string NULL, + col3 string NULL + )ENGINE=OLAP + DUPLICATE KEY(`col1`) + COMMENT "OLAP" + DISTRIBUTED BY HASH(`col1`) BUCKETS 2 + PROPERTIES ( + "replication_num" = "1" + ) + """ + sql """insert into invalidTest values(1, "1", "1")""" + + sql """alter table invalidTest modify column col1 set stats ('row_count'='100', 'ndv'='100', 'num_nulls'='0.0', 'data_size'='3.2E8', 'min_value'='1', 'max_value'='20000000');""" + sql """alter table invalidTest modify column col2 set stats ('row_count'='100', 'ndv'='0', 'num_nulls'='0.0', 'data_size'='3.2E8', 'min_value'='min', 'max_value'='max');""" + sql """alter table invalidTest modify column col3 set stats ('row_count'='100', 'ndv'='0', 'num_nulls'='100', 'data_size'='3.2E8', 'min_value'='min', 'max_value'='max');""" + result = sql """show column cached stats invalidTest""" + assertEquals(3, result.size()) + + explain { + sql("memo plan select * from invalidTest") + contains "col1#0 -> ndv=100.0000" + contains "col2#1 -> ndv=0.0000" + contains "col3#2 -> unknown(100.0)" + } + + sql """drop database if exists test_analyze_all_null""" +} --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org For additional commands, e-mail: commits-h...@doris.apache.org