This is an automated email from the ASF dual-hosted git repository.

lijibing pushed a commit to branch branch-2.0
in repository https://gitbox.apache.org/repos/asf/doris.git


The following commit(s) were added to refs/heads/branch-2.0 by this push:
     new 70b5a83d65c [fix](statistics)Skip analyze if the collected info is 
invalid. (#42028) (#42086)
70b5a83d65c is described below

commit 70b5a83d65c6cfb99d16ce69f485168497bfc12e
Author: Jibing-Li <64681310+jibing...@users.noreply.github.com>
AuthorDate: Fri Oct 18 22:24:50 2024 +0800

    [fix](statistics)Skip analyze if the collected info is invalid. (#42028) 
(#42086)
    
    backport: https://github.com/apache/doris/pull/42028
---
 .../apache/doris/statistics/BaseAnalysisTask.java  |  5 ++
 .../org/apache/doris/statistics/ColStatsData.java  | 21 ++++++
 .../doris/statistics/BaseAnalysisTaskTest.java     | 45 +++++++++++
 .../apache/doris/statistics/ColStatsDataTest.java  | 87 ++++++++++++++++++++++
 4 files changed, 158 insertions(+)

diff --git 
a/fe/fe-core/src/main/java/org/apache/doris/statistics/BaseAnalysisTask.java 
b/fe/fe-core/src/main/java/org/apache/doris/statistics/BaseAnalysisTask.java
index 6ab65677e7a..f6553b7c485 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/statistics/BaseAnalysisTask.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/statistics/BaseAnalysisTask.java
@@ -293,6 +293,11 @@ public abstract class BaseAnalysisTask {
         try (AutoCloseConnectContext a  = 
StatisticsUtil.buildConnectContext()) {
             stmtExecutor = new StmtExecutor(a.connectContext, sql);
             ColStatsData colStatsData = new 
ColStatsData(stmtExecutor.executeInternalQuery().get(0));
+            if (!colStatsData.isValid()) {
+                String message = String.format("ColStatsData is invalid, skip 
analyzing. %s", colStatsData.toSQL(true));
+                LOG.warn(message);
+                throw new RuntimeException(message);
+            }
             // Update index row count after analyze.
             if (this instanceof OlapAnalysisTask) {
                 AnalysisInfo jobInfo = 
Env.getCurrentEnv().getAnalysisManager().findJobInfo(job.getJobInfo().jobId);
diff --git 
a/fe/fe-core/src/main/java/org/apache/doris/statistics/ColStatsData.java 
b/fe/fe-core/src/main/java/org/apache/doris/statistics/ColStatsData.java
index 7cf75462fee..bb826399458 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/statistics/ColStatsData.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/statistics/ColStatsData.java
@@ -186,4 +186,25 @@ public class ColStatsData {
             return ColumnStatistic.UNKNOWN;
         }
     }
+
+    public boolean isNull(String value) {
+        // Checking "NULL" as null is a historical bug which treat literal 
value "NULL" as null. Will fix it soon.
+        return value == null || value.equalsIgnoreCase("NULL");
+    }
+
+    public boolean isValid() {
+        if (ndv > 10 * count) {
+            LOG.debug("Ndv {} is much larger than count {}", ndv, count);
+            return false;
+        }
+        if (ndv == 0 && (!isNull(minLit) || !isNull(maxLit))) {
+            LOG.debug("Ndv is 0 but min or max exists");
+            return false;
+        }
+        if (count > 0 && ndv == 0 && isNull(minLit) && isNull(maxLit) && 
(nullCount == 0 || count > nullCount * 10)) {
+            LOG.debug("count {} not 0, ndv is 0, min and max are all null, 
null count {} is too small", count, count);
+            return false;
+        }
+        return true;
+    }
 }
diff --git 
a/fe/fe-core/src/test/java/org/apache/doris/statistics/BaseAnalysisTaskTest.java
 
b/fe/fe-core/src/test/java/org/apache/doris/statistics/BaseAnalysisTaskTest.java
index 86ccfff26e0..de2cf522366 100644
--- 
a/fe/fe-core/src/test/java/org/apache/doris/statistics/BaseAnalysisTaskTest.java
+++ 
b/fe/fe-core/src/test/java/org/apache/doris/statistics/BaseAnalysisTaskTest.java
@@ -20,10 +20,16 @@ package org.apache.doris.statistics;
 import org.apache.doris.analysis.TableSample;
 import org.apache.doris.catalog.Column;
 import org.apache.doris.catalog.PrimitiveType;
+import org.apache.doris.qe.StmtExecutor;
 
+import com.google.common.collect.Lists;
+import mockit.Mock;
+import mockit.MockUp;
 import org.junit.jupiter.api.Assertions;
 import org.junit.jupiter.api.Test;
 
+import java.util.List;
+
 public class BaseAnalysisTaskTest {
 
     @Test
@@ -60,4 +66,43 @@ public class BaseAnalysisTaskTest {
         System.out.println(ndvFunction);
     }
 
+    @Test
+    public void testInvalidColStats() {
+        List<String> values = Lists.newArrayList();
+        values.add("id");
+        values.add("10000");
+        values.add("20000");
+        values.add("30000");
+        values.add("0");
+        values.add("col");
+        values.add(null);
+        values.add("100"); // count
+        values.add("1100"); // ndv
+        values.add("300"); // null
+        values.add("min");
+        values.add("max");
+        values.add("400");
+        values.add("500");
+        ResultRow row = new ResultRow(values);
+        List<ResultRow> result = Lists.newArrayList();
+        result.add(row);
+
+        new MockUp<StmtExecutor>() {
+            @Mock
+            public List<ResultRow> executeInternalQuery() {
+                return result;
+            }
+        };
+        BaseAnalysisTask task = new OlapAnalysisTask();
+        try {
+            task.runQuery("test");
+        } catch (Exception e) {
+            Assertions.assertEquals(e.getMessage(),
+                    "ColStatsData is invalid, skip analyzing. "
+                        + 
"('id',10000,20000,30000,0,'col',null,100,1100,300,'min','max',400,'500')");
+            return;
+        }
+        Assertions.fail();
+    }
+
 }
diff --git 
a/fe/fe-core/src/test/java/org/apache/doris/statistics/ColStatsDataTest.java 
b/fe/fe-core/src/test/java/org/apache/doris/statistics/ColStatsDataTest.java
index 8743105a644..a7c84370153 100644
--- a/fe/fe-core/src/test/java/org/apache/doris/statistics/ColStatsDataTest.java
+++ b/fe/fe-core/src/test/java/org/apache/doris/statistics/ColStatsDataTest.java
@@ -164,4 +164,91 @@ public class ColStatsDataTest {
         Assertions.assertEquals(400, columnStatistic.dataSize);
         Assertions.assertEquals("500", columnStatistic.updatedTime);
     }
+
+    @Test
+    public void testIsNull() {
+        ColStatsData stats = new ColStatsData();
+        Assertions.assertTrue(stats.isNull(null));
+        Assertions.assertTrue(stats.isNull("null"));
+        Assertions.assertTrue(stats.isNull("NuLl"));
+        Assertions.assertFalse(stats.isNull(""));
+        Assertions.assertFalse(stats.isNull(" "));
+        Assertions.assertFalse(stats.isNull("123"));
+    }
+
+    @Test
+    public void testIsValid() {
+        List<String> values = Lists.newArrayList();
+        values.add("id");
+        values.add("10000");
+        values.add("20000");
+        values.add("30000");
+        values.add("0");
+        values.add("col");
+        values.add(null);
+        values.add("100"); // count
+        values.add("1100"); // ndv
+        values.add("300"); // null
+        values.add("min");
+        values.add("max");
+        values.add("400");
+        values.add("500");
+        ResultRow row = new ResultRow(values);
+        ColStatsData data = new ColStatsData(row);
+        Assertions.assertFalse(data.isValid());
+
+        // Set count = 200
+        values.set(7, "200");
+        row = new ResultRow(values);
+        data = new ColStatsData(row);
+        Assertions.assertTrue(data.isValid());
+
+        // Set ndv = 0, min/max is not null
+        values.set(8, "0");
+        row = new ResultRow(values);
+        data = new ColStatsData(row);
+        Assertions.assertFalse(data.isValid());
+
+        // Set min to null, min/max is not null
+        values.set(10, null);
+        row = new ResultRow(values);
+        data = new ColStatsData(row);
+        Assertions.assertFalse(data.isValid());
+
+        // Set max to null, min/max is not null
+        values.set(11, null);
+        row = new ResultRow(values);
+        data = new ColStatsData(row);
+        Assertions.assertTrue(data.isValid());
+
+        // Set min to not null, min/max is not null
+        values.set(10, "min");
+        row = new ResultRow(values);
+        data = new ColStatsData(row);
+        Assertions.assertFalse(data.isValid());
+
+        // Set min and max to null, nullNum = 0
+        values.set(9, "0");
+        values.set(10, "nuLl");
+        values.set(11, null);
+        row = new ResultRow(values);
+        data = new ColStatsData(row);
+        Assertions.assertFalse(data.isValid());
+
+        // nullNum = 19, count = 200
+        values.set(9, "19");
+        row = new ResultRow(values);
+        data = new ColStatsData(row);
+        Assertions.assertFalse(data.isValid());
+
+        // nullNum = 21, count = 200, so count < nullNum * 10
+        values.set(9, "21");
+        row = new ResultRow(values);
+        data = new ColStatsData(row);
+        Assertions.assertTrue(data.isValid());
+
+        // Empty table stats is valid.
+        data = new ColStatsData();
+        Assertions.assertTrue(data.isValid());
+    }
 }


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org
For additional commands, e-mail: commits-h...@doris.apache.org

Reply via email to