morrySnow commented on code in PR #26435: URL: https://github.com/apache/doris/pull/26435#discussion_r1389363251
########## fe/fe-core/src/main/java/org/apache/doris/statistics/BaseAnalysisTask.java: ########## @@ -66,10 +62,48 @@ public abstract class BaseAnalysisTask { + " CAST(MAX(${colName}) AS STRING) AS max, " + " ${dataSizeFunction} AS data_size, " + " NOW() AS update_time " - + " FROM `${dbName}`.`${tblName}`"; + + " FROM `${catalogName}`.`${dbName}`.`${tblName}`"; - protected static final String ANALYZE_PARTITION_COLUMN_TEMPLATE = - " SELECT " + protected static final String LINEAR_ANALYZE_TEMPLATE = " SELECT " + + "CONCAT(${tblId}, '-', ${idxId}, '-', '${colId}') AS id, " + + "${catalogId} AS catalog_id, " + + "${dbId} AS db_id, " + + "${tblId} AS tbl_id, " + + "${idxId} AS idx_id, " + + "'${colId}' AS col_id, " + + "NULL AS part_id, " + + "ROUND(COUNT(1) * ${scaleFactor}) AS row_count, " + + "ROUND(NDV(`${colName}`) * ${scaleFactor}) as ndv, " + + "ROUND(SUM(CASE WHEN `${colName}` IS NULL THEN 1 ELSE 0 END) * ${scaleFactor}) AS null_count, " + + "${min} AS min, " + + "${max} AS max, " + + "${dataSizeFunction} * ${scaleFactor} AS data_size, " + + "NOW() " + + "FROM `${catalogName}`.`${dbName}`.`${tblName}` ${sampleHints} ${limit}"; + + protected static final String DUJ1_ANALYZE_TEMPLATE = "SELECT " Review Comment: ditto ########## fe/fe-core/src/main/java/org/apache/doris/statistics/HMSAnalysisTask.java: ########## @@ -120,34 +108,64 @@ private boolean isPartitionColumn() { return table.getPartitionColumns().stream().anyMatch(c -> c.getName().equals(col.getName())); } + // Get ordinary column stats. Ordinary column means not partition column. 
private void getOrdinaryColumnStats() throws Exception { - // An example sql for a column stats: - // INSERT INTO __internal_schema.column_statistics - // SELECT CONCAT(13055, '-', -1, '-', 'r_regionkey') AS id, - // 13002 AS catalog_id, - // 13038 AS db_id, - // 13055 AS tbl_id, - // -1 AS idx_id, - // 'r_regionkey' AS col_id, - // 'NULL' AS part_id, - // COUNT(1) AS row_count, - // NDV(`r_regionkey`) AS ndv, - // SUM(CASE WHEN `r_regionkey` IS NULL THEN 1 ELSE 0 END) AS null_count, - // MIN(`r_regionkey`) AS min, - // MAX(`r_regionkey`) AS max, - // 0 AS data_size, - // NOW() FROM `hive`.`tpch100`.`region` StringBuilder sb = new StringBuilder(); - sb.append(ANALYZE_TABLE_TEMPLATE); Map<String, String> params = buildStatsParams("NULL"); - params.put("dataSizeFunction", getDataSizeFunction(col)); - params.put("minFunction", getMinFunction()); - params.put("maxFunction", getMaxFunction()); - StringSubstitutor stringSubstitutor = new StringSubstitutor(params); + params.put("min", getMinFunction()); + params.put("max", getMaxFunction()); + params.put("dataSizeFunction", getDataSizeFunction(col, false)); + Pair<Double, Long> sampleInfo = getSampleInfo(); + params.put("scaleFactor", String.valueOf(sampleInfo.first)); + StringSubstitutor stringSubstitutor; + if (tableSample == null) { + // Do full analyze + LOG.info(String.format("Will do full collection for column %s", col.getName())); + sb.append(COLLECT_COL_STATISTICS); + } else { + // Do sample analyze + LOG.info(String.format("Will do sample collection for column %s", col.getName())); Review Comment: use log format, not use string format ########## fe/fe-core/src/main/java/org/apache/doris/statistics/BaseAnalysisTask.java: ########## @@ -66,10 +62,48 @@ public abstract class BaseAnalysisTask { + " CAST(MAX(${colName}) AS STRING) AS max, " + " ${dataSizeFunction} AS data_size, " + " NOW() AS update_time " - + " FROM `${dbName}`.`${tblName}`"; + + " FROM `${catalogName}`.`${dbName}`.`${tblName}`"; - protected static 
final String ANALYZE_PARTITION_COLUMN_TEMPLATE = - " SELECT " + protected static final String LINEAR_ANALYZE_TEMPLATE = " SELECT " + + "CONCAT(${tblId}, '-', ${idxId}, '-', '${colId}') AS id, " Review Comment: Add backquotes to all aliases, so that adding a new keyword to the lexical analyzer does not break this SQL. ########## fe/fe-core/src/main/java/org/apache/doris/statistics/HMSAnalysisTask.java: ########## @@ -120,34 +108,64 @@ private boolean isPartitionColumn() { return table.getPartitionColumns().stream().anyMatch(c -> c.getName().equals(col.getName())); } + // Get ordinary column stats. Ordinary column means not partition column. private void getOrdinaryColumnStats() throws Exception { - // An example sql for a column stats: - // INSERT INTO __internal_schema.column_statistics - // SELECT CONCAT(13055, '-', -1, '-', 'r_regionkey') AS id, - // 13002 AS catalog_id, - // 13038 AS db_id, - // 13055 AS tbl_id, - // -1 AS idx_id, - // 'r_regionkey' AS col_id, - // 'NULL' AS part_id, - // COUNT(1) AS row_count, - // NDV(`r_regionkey`) AS ndv, - // SUM(CASE WHEN `r_regionkey` IS NULL THEN 1 ELSE 0 END) AS null_count, - // MIN(`r_regionkey`) AS min, - // MAX(`r_regionkey`) AS max, - // 0 AS data_size, - // NOW() FROM `hive`.`tpch100`.`region` StringBuilder sb = new StringBuilder(); - sb.append(ANALYZE_TABLE_TEMPLATE); Map<String, String> params = buildStatsParams("NULL"); - params.put("dataSizeFunction", getDataSizeFunction(col)); - params.put("minFunction", getMinFunction()); - params.put("maxFunction", getMaxFunction()); - StringSubstitutor stringSubstitutor = new StringSubstitutor(params); + params.put("min", getMinFunction()); + params.put("max", getMaxFunction()); + params.put("dataSizeFunction", getDataSizeFunction(col, false)); + Pair<Double, Long> sampleInfo = getSampleInfo(); + params.put("scaleFactor", String.valueOf(sampleInfo.first)); + StringSubstitutor stringSubstitutor; + if (tableSample == null) { + // Do full analyze + LOG.info(String.format("Will do full
collection for column %s", col.getName())); + sb.append(COLLECT_COL_STATISTICS); + } else { + // Do sample analyze + LOG.info(String.format("Will do sample collection for column %s", col.getName())); + boolean limitFlag = false; + boolean bucketFlag = false; + Set<String> bucketColumns = ((HMSExternalTable) tbl).getRemoteTable().getSd().getBucketCols() + .stream().map(String::toLowerCase).collect(Collectors.toSet()); + // If sample size is too large, use limit to control the sample size. + if (needLimit(sampleInfo.second, sampleInfo.first)) { + limitFlag = true; + long columnSize = 0; + for (Column column : table.getFullSchema()) { + columnSize += column.getDataType().getSlotSize(); + } + double targetRows = (double) sampleInfo.second / columnSize; + // Estimate the new scaleFactor based on the schema. + if (targetRows > StatisticsUtil.getHugeTableSampleRows()) { + params.put("limit", "limit " + StatisticsUtil.getHugeTableSampleRows()); + params.put("scaleFactor", + String.valueOf(sampleInfo.first * targetRows / StatisticsUtil.getHugeTableSampleRows())); + } + } + // Distribution columns don't fit for DUJ1 estimator, use linear estimator. + if (bucketColumns.contains(col.getName().toLowerCase(Locale.ROOT))) { + bucketFlag = true; + sb.append(LINEAR_ANALYZE_TEMPLATE); + params.put("rowCount", "ROUND(count(1) * ${scaleFactor})"); + } else { + sb.append(DUJ1_ANALYZE_TEMPLATE); + params.put("dataSizeFunction", getDataSizeFunction(col, true)); + params.put("ndvFunction", getNdvFunction("ROUND(SUM(t1.count) * ${scaleFactor})")); + params.put("rowCount", "ROUND(SUM(t1.count) * ${scaleFactor})"); + } + LOG.info(String.format("Sample for column [%s]. 
Scale factor [%s], " Review Comment: Use the logger's parameterized message format instead of String.format. ########## fe/fe-core/src/main/java/org/apache/doris/statistics/HMSAnalysisTask.java: ########## @@ -120,34 +108,64 @@ private boolean isPartitionColumn() { return table.getPartitionColumns().stream().anyMatch(c -> c.getName().equals(col.getName())); } + // Get ordinary column stats. Ordinary column means not partition column. private void getOrdinaryColumnStats() throws Exception { - // An example sql for a column stats: - // INSERT INTO __internal_schema.column_statistics - // SELECT CONCAT(13055, '-', -1, '-', 'r_regionkey') AS id, - // 13002 AS catalog_id, - // 13038 AS db_id, - // 13055 AS tbl_id, - // -1 AS idx_id, - // 'r_regionkey' AS col_id, - // 'NULL' AS part_id, - // COUNT(1) AS row_count, - // NDV(`r_regionkey`) AS ndv, - // SUM(CASE WHEN `r_regionkey` IS NULL THEN 1 ELSE 0 END) AS null_count, - // MIN(`r_regionkey`) AS min, - // MAX(`r_regionkey`) AS max, - // 0 AS data_size, - // NOW() FROM `hive`.`tpch100`.`region` StringBuilder sb = new StringBuilder(); - sb.append(ANALYZE_TABLE_TEMPLATE); Map<String, String> params = buildStatsParams("NULL"); - params.put("dataSizeFunction", getDataSizeFunction(col)); - params.put("minFunction", getMinFunction()); - params.put("maxFunction", getMaxFunction()); - StringSubstitutor stringSubstitutor = new StringSubstitutor(params); + params.put("min", getMinFunction()); + params.put("max", getMaxFunction()); + params.put("dataSizeFunction", getDataSizeFunction(col, false)); + Pair<Double, Long> sampleInfo = getSampleInfo(); + params.put("scaleFactor", String.valueOf(sampleInfo.first)); + StringSubstitutor stringSubstitutor; + if (tableSample == null) { + // Do full analyze + LOG.info(String.format("Will do full collection for column %s", col.getName())); Review Comment: Use the logger's parameterized message format instead of String.format. -- This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org For additional commands, e-mail: commits-h...@doris.apache.org