This is an automated email from the ASF dual-hosted git repository.

yiguolei pushed a commit to branch branch-2.1
in repository https://gitbox.apache.org/repos/asf/doris.git


The following commit(s) were added to refs/heads/branch-2.1 by this push:
     new 1a1649376f6 branch-2.1:[improvement](statistics)Eliminate null values 
while sample analyzing ndv. (#50574) (#51648)
1a1649376f6 is described below

commit 1a1649376f650c23a3c95d483c95507fe8aa3263
Author: James <lijib...@selectdb.com>
AuthorDate: Fri Jun 13 15:38:11 2025 +0800

    branch-2.1:[improvement](statistics)Eliminate null values while sample 
analyzing ndv. (#50574) (#51648)
    
    backport: https://github.com/apache/doris/pull/50574
---
 .../doris/nereids/stats/StatsCalculator.java       |  11 ++-
 .../apache/doris/statistics/BaseAnalysisTask.java  |  10 +-
 .../org/apache/doris/statistics/ColStatsData.java  |  14 ++-
 .../doris/statistics/BaseAnalysisTaskTest.java     |  46 ++++++++-
 .../apache/doris/statistics/ColStatsDataTest.java  |   2 +-
 .../doris/statistics/OlapAnalysisTaskTest.java     |   4 +-
 .../hive/test_hive_statistics_all_type_p0.groovy   |   4 +-
 .../suites/statistics/test_analyze_all_null.groovy | 108 +++++++++++++++++++++
 8 files changed, 177 insertions(+), 22 deletions(-)

diff --git 
a/fe/fe-core/src/main/java/org/apache/doris/nereids/stats/StatsCalculator.java 
b/fe/fe-core/src/main/java/org/apache/doris/nereids/stats/StatsCalculator.java
index d2dc508e5b4..0a1ae85adfe 100644
--- 
a/fe/fe-core/src/main/java/org/apache/doris/nereids/stats/StatsCalculator.java
+++ 
b/fe/fe-core/src/main/java/org/apache/doris/nereids/stats/StatsCalculator.java
@@ -885,8 +885,15 @@ public class StatsCalculator extends 
DefaultPlanVisitor<Statistics, Void> {
                 return ColumnStatistic.UNKNOWN;
             }
         } else {
-            return 
Env.getCurrentEnv().getStatisticsCache().getColumnStatistics(
-                catalogId, dbId, table.getId(), idxId, colName);
+            ColumnStatistic columnStatistics = 
Env.getCurrentEnv().getStatisticsCache().getColumnStatistics(
+                    catalogId, dbId, table.getId(), idxId, colName);
+            if (!columnStatistics.isUnKnown
+                    && columnStatistics.ndv == 0
+                    && (columnStatistics.minExpr != null || 
columnStatistics.maxExpr != null)
+                    && columnStatistics.numNulls == columnStatistics.count) {
+                return ColumnStatistic.UNKNOWN;
+            }
+            return columnStatistics;
         }
     }
 
diff --git 
a/fe/fe-core/src/main/java/org/apache/doris/statistics/BaseAnalysisTask.java 
b/fe/fe-core/src/main/java/org/apache/doris/statistics/BaseAnalysisTask.java
index 1513dd2d782..926b7499f05 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/statistics/BaseAnalysisTask.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/statistics/BaseAnalysisTask.java
@@ -237,16 +237,14 @@ public abstract class BaseAnalysisTask {
     }
 
     protected String getNdvFunction(String totalRows) {
-        String sampleRows = "SUM(`t1`.`count`)";
-        String onceCount = "SUM(IF(`t1`.`count` = 1, 1, 0))";
-        String countDistinct = "COUNT(1)";
+        String n = "SUM(`t1`.`count`)"; // sample rows
+        String f1 = "SUM(IF(`t1`.`count` = 1 and `t1`.`column_key` is not 
null, 1, 0))";
+        String d = "COUNT(`t1`.`column_key`)"; // sample ndv
         // DUJ1 estimator: n*d / (n - f1 + f1*n/N)
         // f1 is the count of element that appears only once in the sample.
         // 
(https://github.com/postgres/postgres/blob/master/src/backend/commands/analyze.c)
         // 
(http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.93.8637&rep=rep1&type=pdf)
-        // sample_row * count_distinct / ( sample_row - once_count + 
once_count * sample_row / total_row)
-        return MessageFormat.format("{0} * {1} / ({0} - {2} + {2} * {0} / 
{3})", sampleRows,
-                countDistinct, onceCount, totalRows);
+        return MessageFormat.format("{0} * {1} / ({0} - {2} + {2} * {0} / 
{3})", n, d, f1, totalRows);
     }
 
     // Max value is not accurate while sample, so set it to NULL to avoid 
optimizer generate bad plan.
diff --git 
a/fe/fe-core/src/main/java/org/apache/doris/statistics/ColStatsData.java 
b/fe/fe-core/src/main/java/org/apache/doris/statistics/ColStatsData.java
index cc79f779f6e..5d4158db3fc 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/statistics/ColStatsData.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/statistics/ColStatsData.java
@@ -189,15 +189,19 @@ public class ColStatsData {
 
     public boolean isValid() {
         if (ndv > 10 * count) {
-            LOG.debug("Ndv {} is much larger than count {}", ndv, count);
+            String message = String.format("ColStatsData ndv too large. %s", 
toSQL(true));
+            LOG.warn(message);
             return false;
         }
-        if (ndv == 0 && (!isNull(minLit) || !isNull(maxLit))) {
-            LOG.debug("Ndv is 0 but min or max exists");
+        if (ndv == 0 && (!isNull(minLit) || !isNull(maxLit)) && nullCount != 
count) {
+            String message = String.format("ColStatsData ndv 0 but min/max is 
not null and nullCount != count. %s",
+                    toSQL(true));
+            LOG.warn(message);
             return false;
         }
-        if (count > 0 && ndv == 0 && isNull(minLit) && isNull(maxLit) && 
(nullCount == 0 || count > nullCount * 10)) {
-            LOG.debug("count {} not 0, ndv is 0, min and max are all null, 
null count {} is too small", count, count);
+        if (count > 0 && ndv == 0 && isNull(minLit) && isNull(maxLit) && 
(count > nullCount * 10)) {
+            LOG.warn("count {} not 0, ndv is 0, min and max are all null, null 
count {} is too small",
+                    count, nullCount);
             return false;
         }
         return true;
diff --git 
a/fe/fe-core/src/test/java/org/apache/doris/statistics/BaseAnalysisTaskTest.java
 
b/fe/fe-core/src/test/java/org/apache/doris/statistics/BaseAnalysisTaskTest.java
index dfa0d9fca9e..8405216a271 100644
--- 
a/fe/fe-core/src/test/java/org/apache/doris/statistics/BaseAnalysisTaskTest.java
+++ 
b/fe/fe-core/src/test/java/org/apache/doris/statistics/BaseAnalysisTaskTest.java
@@ -62,13 +62,12 @@ public class BaseAnalysisTaskTest {
         Assertions.assertEquals("NULL", maxFunction);
 
         String ndvFunction = 
olapAnalysisTask.getNdvFunction(String.valueOf(100));
-        Assertions.assertEquals("SUM(`t1`.`count`) * COUNT(1) / 
(SUM(`t1`.`count`) - SUM(IF(`t1`.`count` = 1, 1, 0)) "
-                + "+ SUM(IF(`t1`.`count` = 1, 1, 0)) * SUM(`t1`.`count`) / 
100)", ndvFunction);
+        Assertions.assertEquals("SUM(`t1`.`count`) * COUNT(`t1`.`column_key`) 
/ (SUM(`t1`.`count`) - SUM(IF(`t1`.`count` = 1 and `t1`.`column_key` is not 
null, 1, 0)) + SUM(IF(`t1`.`count` = 1 and `t1`.`column_key` is not null, 1, 
0)) * SUM(`t1`.`count`) / 100)", ndvFunction);
         System.out.println(ndvFunction);
     }
 
     @Test
-    public void testInvalidColStats() {
+    public void testNdvTooLarge() {
         List<String> values = Lists.newArrayList();
         values.add("id");
         values.add("10000");
@@ -101,10 +100,49 @@ public class BaseAnalysisTaskTest {
         } catch (Exception e) {
             Assertions.assertEquals(e.getMessage(),
                     "ColStatsData is invalid, skip analyzing. "
-                        + 
"('id',10000,20000,30000,0,'col',null,100,1100,300,'min','max',400,'500')");
+                            + 
"('id',10000,20000,30000,0,'col',null,100,1100,300,'min','max',400,'500')");
             return;
         }
         Assertions.fail();
     }
 
+    @Test
+    public void testNdv0MinMaxExistsNullNotEqualCount() {
+        List<String> values = Lists.newArrayList();
+        values.add("id");
+        values.add("10000");
+        values.add("20000");
+        values.add("30000");
+        values.add("0");
+        values.add("col");
+        values.add(null);
+        values.add("500"); // count
+        values.add("0"); // ndv
+        values.add("300"); // null
+        values.add("min");
+        values.add("max");
+        values.add("400");
+        values.add("500");
+        ResultRow row = new ResultRow(values);
+        List<ResultRow> result = Lists.newArrayList();
+        result.add(row);
+
+        new MockUp<StmtExecutor>() {
+            @Mock
+            public List<ResultRow> executeInternalQuery() {
+                return result;
+            }
+        };
+        BaseAnalysisTask task = new OlapAnalysisTask();
+        task.info = new 
AnalysisInfoBuilder().setJobType(JobType.MANUAL).build();
+        try {
+            task.runQuery("test");
+        } catch (Exception e) {
+            Assertions.assertEquals(e.getMessage(),
+                    "ColStatsData is invalid, skip analyzing. "
+                            + 
"('id',10000,20000,30000,0,'col',null,500,0,300,'min','max',400,'500')");
+            return;
+        }
+        Assertions.fail();
+    }
 }
diff --git 
a/fe/fe-core/src/test/java/org/apache/doris/statistics/ColStatsDataTest.java 
b/fe/fe-core/src/test/java/org/apache/doris/statistics/ColStatsDataTest.java
index a7c84370153..b193edb2049 100644
--- a/fe/fe-core/src/test/java/org/apache/doris/statistics/ColStatsDataTest.java
+++ b/fe/fe-core/src/test/java/org/apache/doris/statistics/ColStatsDataTest.java
@@ -215,7 +215,7 @@ public class ColStatsDataTest {
         data = new ColStatsData(row);
         Assertions.assertFalse(data.isValid());
 
-        // Set max to null, min/max is not null
+        // Set max to null, min/max are all null
         values.set(11, null);
         row = new ResultRow(values);
         data = new ColStatsData(row);
diff --git 
a/fe/fe-core/src/test/java/org/apache/doris/statistics/OlapAnalysisTaskTest.java
 
b/fe/fe-core/src/test/java/org/apache/doris/statistics/OlapAnalysisTaskTest.java
index 96d76a61908..3e0e309a488 100644
--- 
a/fe/fe-core/src/test/java/org/apache/doris/statistics/OlapAnalysisTaskTest.java
+++ 
b/fe/fe-core/src/test/java/org/apache/doris/statistics/OlapAnalysisTaskTest.java
@@ -370,7 +370,7 @@ public class OlapAnalysisTaskTest {
         Assertions.assertTrue(task.scanFullTable());
         Assertions.assertEquals("1.0", params.get("scaleFactor"));
         Assertions.assertEquals("", params.get("sampleHints"));
-        Assertions.assertEquals("SUM(`t1`.`count`) * COUNT(1) / 
(SUM(`t1`.`count`) - SUM(IF(`t1`.`count` = 1, 1, 0)) + SUM(IF(`t1`.`count` = 1, 
1, 0)) * SUM(`t1`.`count`) / 10)", params.get("ndvFunction"));
+        Assertions.assertEquals("SUM(`t1`.`count`) * COUNT(`t1`.`column_key`) 
/ (SUM(`t1`.`count`) - SUM(IF(`t1`.`count` = 1 and `t1`.`column_key` is not 
null, 1, 0)) + SUM(IF(`t1`.`count` = 1 and `t1`.`column_key` is not null, 1, 
0)) * SUM(`t1`.`count`) / 10)", params.get("ndvFunction"));
         params.clear();
 
         task = new OlapAnalysisTask();
@@ -378,7 +378,7 @@ public class OlapAnalysisTaskTest {
         task.getSampleParams(params, 1000);
         Assertions.assertEquals("10.0", params.get("scaleFactor"));
         Assertions.assertEquals("TABLET(1, 2)", params.get("sampleHints"));
-        Assertions.assertEquals("SUM(`t1`.`count`) * COUNT(1) / 
(SUM(`t1`.`count`) - SUM(IF(`t1`.`count` = 1, 1, 0)) + SUM(IF(`t1`.`count` = 1, 
1, 0)) * SUM(`t1`.`count`) / 1000)", params.get("ndvFunction"));
+        Assertions.assertEquals("SUM(`t1`.`count`) * COUNT(`t1`.`column_key`) 
/ (SUM(`t1`.`count`) - SUM(IF(`t1`.`count` = 1 and `t1`.`column_key` is not 
null, 1, 0)) + SUM(IF(`t1`.`count` = 1 and `t1`.`column_key` is not null, 1, 
0)) * SUM(`t1`.`count`) / 1000)", params.get("ndvFunction"));
         Assertions.assertEquals("SUM(t1.count) * 4", 
params.get("dataSizeFunction"));
         Assertions.assertEquals("`${colName}`", 
params.get("subStringColName"));
         params.clear();
diff --git 
a/regression-test/suites/external_table_p0/hive/test_hive_statistics_all_type_p0.groovy
 
b/regression-test/suites/external_table_p0/hive/test_hive_statistics_all_type_p0.groovy
index 7d85f288bd7..b111247d9e7 100644
--- 
a/regression-test/suites/external_table_p0/hive/test_hive_statistics_all_type_p0.groovy
+++ 
b/regression-test/suites/external_table_p0/hive/test_hive_statistics_all_type_p0.groovy
@@ -41,14 +41,14 @@ suite("test_hive_statistics_all_type_p0", 
"all_types,p0,external,hive,external_d
             result = sql """show column stats orc_all_types (int_col);"""
             assertEquals("int_col", result[0][0])
             assertEquals("3600.0", result[0][2])
-            assertEquals("3240.0", result[0][3])
+            assertEquals("3239.0", result[0][3])
             assertEquals("361.0", result[0][4])
             assertEquals("14400.0", result[0][5])
 
             result = sql """show column stats orc_all_types (string_col);"""
             assertEquals("string_col", result[0][0])
             assertEquals("3600.0", result[0][2])
-            assertEquals("3254.0", result[0][3])
+            assertEquals("3253.0", result[0][3])
             assertEquals("347.0", result[0][4])
             assertEquals("453634.0", result[0][5])
 
diff --git a/regression-test/suites/statistics/test_analyze_all_null.groovy 
b/regression-test/suites/statistics/test_analyze_all_null.groovy
new file mode 100644
index 00000000000..5defabb67eb
--- /dev/null
+++ b/regression-test/suites/statistics/test_analyze_all_null.groovy
@@ -0,0 +1,108 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+suite("test_analyze_all_null") {
+
+    def wait_row_count_reported = { db, table, row, column, expected ->
+        def result = sql """show frontends;"""
+        logger.info("show frontends result origin: " + result)
+        def host
+        def port
+        for (int i = 0; i < result.size(); i++) {
+            if (result[i][8] == "true") {
+                host = result[i][1]
+                port = result[i][4]
+            }
+        }
+        def tokens = context.config.jdbcUrl.split('/')
+        def url=tokens[0] + "//" + host + ":" + port
+        logger.info("Master url is " + url)
+        connect(context.config.jdbcUser, context.config.jdbcPassword, url) {
+            sql """use ${db}"""
+            result = sql """show frontends;"""
+            logger.info("show frontends result master: " + result)
+            for (int i = 0; i < 120; i++) {
+                Thread.sleep(5000)
+                result = sql """SHOW DATA FROM ${table};"""
+                logger.info("result " + result)
+                if (result[row][column] == expected) {
+                    return;
+                }
+            }
+            throw new Exception("Row count report timeout.")
+        }
+
+    }
+
+    sql """drop database if exists test_analyze_all_null"""
+    sql """create database test_analyze_all_null"""
+    sql """use test_analyze_all_null"""
+    sql """set global enable_auto_analyze=false"""
+
+    sql """CREATE TABLE allnull (
+            key1 int NULL,
+            value1 varchar(25) NULL
+        )ENGINE=OLAP
+        DUPLICATE KEY(`key1`)
+        COMMENT "OLAP"
+        DISTRIBUTED BY HASH(`key1`) BUCKETS 2
+        PROPERTIES (
+            "replication_num" = "1"
+        )
+    """
+    sql """insert into allnull select null, null from 
numbers("number"="10000000")"""
+    wait_row_count_reported("test_analyze_all_null", "allnull", 0, 4, 
"10000000")
+    sql """analyze table allnull with sample rows 4000000 with sync"""
+
+    def result = sql """show column stats allnull(key1)"""
+    assertEquals(1, result.size())
+    assertEquals("1.0E7", result[0][2])
+    assertEquals("0.0", result[0][3])
+    result = sql """show column stats allnull(value1)"""
+    assertEquals(1, result.size())
+    assertEquals("1.0E7", result[0][2])
+    assertEquals("0.0", result[0][3])
+
+    sql """CREATE TABLE invalidTest (
+            col1 int NULL,
+            col2 string NULL,
+            col3 string NULL
+        )ENGINE=OLAP
+        DUPLICATE KEY(`col1`)
+        COMMENT "OLAP"
+        DISTRIBUTED BY HASH(`col1`) BUCKETS 2
+        PROPERTIES (
+            "replication_num" = "1"
+        )
+    """
+    sql """insert into invalidTest values(1, "1", "1")"""
+
+    sql """alter table invalidTest modify column col1 set stats 
('row_count'='100', 'ndv'='100', 'num_nulls'='0.0', 'data_size'='3.2E8', 
'min_value'='1', 'max_value'='20000000');"""
+    sql """alter table invalidTest modify column col2 set stats 
('row_count'='100', 'ndv'='0', 'num_nulls'='0.0', 'data_size'='3.2E8', 
'min_value'='min', 'max_value'='max');"""
+    sql """alter table invalidTest modify column col3 set stats 
('row_count'='100', 'ndv'='0', 'num_nulls'='100', 'data_size'='3.2E8', 
'min_value'='min', 'max_value'='max');"""
+    result = sql """show column cached stats invalidTest"""
+    assertEquals(3, result.size())
+
+    explain {
+        sql("memo plan select * from invalidTest")
+        contains "col1#0 -> ndv=100.0000"
+        contains "col2#1 -> ndv=0.0000"
+        contains "col3#2 -> unknown(100.0)"
+    }
+
+    sql """drop database if exists test_analyze_all_null"""
+}


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org
For additional commands, e-mail: commits-h...@doris.apache.org

Reply via email to