morrySnow commented on code in PR #26435: URL: https://github.com/apache/doris/pull/26435#discussion_r1389363251
########## fe/fe-core/src/main/java/org/apache/doris/statistics/BaseAnalysisTask.java: ########## @@ -66,10 +62,48 @@ public abstract class BaseAnalysisTask { + " CAST(MAX(${colName}) AS STRING) AS max, " + " ${dataSizeFunction} AS data_size, " + " NOW() AS update_time " - + " FROM `${dbName}`.`${tblName}`"; + + " FROM `${catalogName}`.`${dbName}`.`${tblName}`"; - protected static final String ANALYZE_PARTITION_COLUMN_TEMPLATE = - " SELECT " + protected static final String LINEAR_ANALYZE_TEMPLATE = " SELECT " + + "CONCAT(${tblId}, '-', ${idxId}, '-', '${colId}') AS id, " + + "${catalogId} AS catalog_id, " + + "${dbId} AS db_id, " + + "${tblId} AS tbl_id, " + + "${idxId} AS idx_id, " + + "'${colId}' AS col_id, " + + "NULL AS part_id, " + + "ROUND(COUNT(1) * ${scaleFactor}) AS row_count, " + + "ROUND(NDV(`${colName}`) * ${scaleFactor}) as ndv, " + + "ROUND(SUM(CASE WHEN `${colName}` IS NULL THEN 1 ELSE 0 END) * ${scaleFactor}) AS null_count, " + + "${min} AS min, " + + "${max} AS max, " + + "${dataSizeFunction} * ${scaleFactor} AS data_size, " + + "NOW() " + + "FROM `${catalogName}`.`${dbName}`.`${tblName}` ${sampleHints} ${limit}"; + + protected static final String DUJ1_ANALYZE_TEMPLATE = "SELECT " Review Comment: ditto ########## fe/fe-core/src/main/java/org/apache/doris/statistics/HMSAnalysisTask.java: ########## @@ -120,34 +108,64 @@ private boolean isPartitionColumn() { return table.getPartitionColumns().stream().anyMatch(c -> c.getName().equals(col.getName())); } + // Get ordinary column stats. Ordinary column means not partition column. 
private void getOrdinaryColumnStats() throws Exception { - // An example sql for a column stats: - // INSERT INTO __internal_schema.column_statistics - // SELECT CONCAT(13055, '-', -1, '-', 'r_regionkey') AS id, - // 13002 AS catalog_id, - // 13038 AS db_id, - // 13055 AS tbl_id, - // -1 AS idx_id, - // 'r_regionkey' AS col_id, - // 'NULL' AS part_id, - // COUNT(1) AS row_count, - // NDV(`r_regionkey`) AS ndv, - // SUM(CASE WHEN `r_regionkey` IS NULL THEN 1 ELSE 0 END) AS null_count, - // MIN(`r_regionkey`) AS min, - // MAX(`r_regionkey`) AS max, - // 0 AS data_size, - // NOW() FROM `hive`.`tpch100`.`region` StringBuilder sb = new StringBuilder(); - sb.append(ANALYZE_TABLE_TEMPLATE); Map<String, String> params = buildStatsParams("NULL"); - params.put("dataSizeFunction", getDataSizeFunction(col)); - params.put("minFunction", getMinFunction()); - params.put("maxFunction", getMaxFunction()); - StringSubstitutor stringSubstitutor = new StringSubstitutor(params); + params.put("min", getMinFunction()); + params.put("max", getMaxFunction()); + params.put("dataSizeFunction", getDataSizeFunction(col, false)); + Pair<Double, Long> sampleInfo = getSampleInfo(); + params.put("scaleFactor", String.valueOf(sampleInfo.first)); + StringSubstitutor stringSubstitutor; + if (tableSample == null) { + // Do full analyze + LOG.info(String.format("Will do full collection for column %s", col.getName())); + sb.append(COLLECT_COL_STATISTICS); + } else { + // Do sample analyze + LOG.info(String.format("Will do sample collection for column %s", col.getName())); Review Comment: use log format, not use string format ########## fe/fe-core/src/main/java/org/apache/doris/statistics/BaseAnalysisTask.java: ########## @@ -66,10 +62,48 @@ public abstract class BaseAnalysisTask { + " CAST(MAX(${colName}) AS STRING) AS max, " + " ${dataSizeFunction} AS data_size, " + " NOW() AS update_time " - + " FROM `${dbName}`.`${tblName}`"; + + " FROM `${catalogName}`.`${dbName}`.`${tblName}`"; - protected static 
final String ANALYZE_PARTITION_COLUMN_TEMPLATE = - " SELECT " + protected static final String LINEAR_ANALYZE_TEMPLATE = " SELECT " + + "CONCAT(${tblId}, '-', ${idxId}, '-', '${colId}') AS id, " Review Comment: Add backquotes to all aliases, so that adding a new keyword to the lexical analyzer does not break this SQL. ########## fe/fe-core/src/main/java/org/apache/doris/statistics/HMSAnalysisTask.java: ########## @@ -120,34 +108,64 @@ private boolean isPartitionColumn() { return table.getPartitionColumns().stream().anyMatch(c -> c.getName().equals(col.getName())); } + // Get ordinary column stats. Ordinary column means not partition column. private void getOrdinaryColumnStats() throws Exception { - // An example sql for a column stats: - // INSERT INTO __internal_schema.column_statistics - // SELECT CONCAT(13055, '-', -1, '-', 'r_regionkey') AS id, - // 13002 AS catalog_id, - // 13038 AS db_id, - // 13055 AS tbl_id, - // -1 AS idx_id, - // 'r_regionkey' AS col_id, - // 'NULL' AS part_id, - // COUNT(1) AS row_count, - // NDV(`r_regionkey`) AS ndv, - // SUM(CASE WHEN `r_regionkey` IS NULL THEN 1 ELSE 0 END) AS null_count, - // MIN(`r_regionkey`) AS min, - // MAX(`r_regionkey`) AS max, - // 0 AS data_size, - // NOW() FROM `hive`.`tpch100`.`region` StringBuilder sb = new StringBuilder(); - sb.append(ANALYZE_TABLE_TEMPLATE); Map<String, String> params = buildStatsParams("NULL"); - params.put("dataSizeFunction", getDataSizeFunction(col)); - params.put("minFunction", getMinFunction()); - params.put("maxFunction", getMaxFunction()); - StringSubstitutor stringSubstitutor = new StringSubstitutor(params); + params.put("min", getMinFunction()); + params.put("max", getMaxFunction()); + params.put("dataSizeFunction", getDataSizeFunction(col, false)); + Pair<Double, Long> sampleInfo = getSampleInfo(); + params.put("scaleFactor", String.valueOf(sampleInfo.first)); + StringSubstitutor stringSubstitutor; + if (tableSample == null) { + // Do full analyze + LOG.info(String.format("Will do full
collection for column %s", col.getName())); + sb.append(COLLECT_COL_STATISTICS); + } else { + // Do sample analyze + LOG.info(String.format("Will do sample collection for column %s", col.getName())); + boolean limitFlag = false; + boolean bucketFlag = false; + Set<String> bucketColumns = ((HMSExternalTable) tbl).getRemoteTable().getSd().getBucketCols() + .stream().map(String::toLowerCase).collect(Collectors.toSet()); + // If sample size is too large, use limit to control the sample size. + if (needLimit(sampleInfo.second, sampleInfo.first)) { + limitFlag = true; + long columnSize = 0; + for (Column column : table.getFullSchema()) { + columnSize += column.getDataType().getSlotSize(); + } + double targetRows = (double) sampleInfo.second / columnSize; + // Estimate the new scaleFactor based on the schema. + if (targetRows > StatisticsUtil.getHugeTableSampleRows()) { + params.put("limit", "limit " + StatisticsUtil.getHugeTableSampleRows()); + params.put("scaleFactor", + String.valueOf(sampleInfo.first * targetRows / StatisticsUtil.getHugeTableSampleRows())); + } + } + // Distribution columns don't fit for DUJ1 estimator, use linear estimator. + if (bucketColumns.contains(col.getName().toLowerCase(Locale.ROOT))) { + bucketFlag = true; + sb.append(LINEAR_ANALYZE_TEMPLATE); + params.put("rowCount", "ROUND(count(1) * ${scaleFactor})"); + } else { + sb.append(DUJ1_ANALYZE_TEMPLATE); + params.put("dataSizeFunction", getDataSizeFunction(col, true)); + params.put("ndvFunction", getNdvFunction("ROUND(SUM(t1.count) * ${scaleFactor})")); + params.put("rowCount", "ROUND(SUM(t1.count) * ${scaleFactor})"); + } + LOG.info(String.format("Sample for column [%s]. 
Scale factor [%s], " Review Comment: Use the logger's parameterized message format instead of String.format. ########## fe/fe-core/src/main/java/org/apache/doris/statistics/HMSAnalysisTask.java: ########## @@ -120,34 +108,64 @@ private boolean isPartitionColumn() { return table.getPartitionColumns().stream().anyMatch(c -> c.getName().equals(col.getName())); } + // Get ordinary column stats. Ordinary column means not partition column. private void getOrdinaryColumnStats() throws Exception { - // An example sql for a column stats: - // INSERT INTO __internal_schema.column_statistics - // SELECT CONCAT(13055, '-', -1, '-', 'r_regionkey') AS id, - // 13002 AS catalog_id, - // 13038 AS db_id, - // 13055 AS tbl_id, - // -1 AS idx_id, - // 'r_regionkey' AS col_id, - // 'NULL' AS part_id, - // COUNT(1) AS row_count, - // NDV(`r_regionkey`) AS ndv, - // SUM(CASE WHEN `r_regionkey` IS NULL THEN 1 ELSE 0 END) AS null_count, - // MIN(`r_regionkey`) AS min, - // MAX(`r_regionkey`) AS max, - // 0 AS data_size, - // NOW() FROM `hive`.`tpch100`.`region` StringBuilder sb = new StringBuilder(); - sb.append(ANALYZE_TABLE_TEMPLATE); Map<String, String> params = buildStatsParams("NULL"); - params.put("dataSizeFunction", getDataSizeFunction(col)); - params.put("minFunction", getMinFunction()); - params.put("maxFunction", getMaxFunction()); - StringSubstitutor stringSubstitutor = new StringSubstitutor(params); + params.put("min", getMinFunction()); + params.put("max", getMaxFunction()); + params.put("dataSizeFunction", getDataSizeFunction(col, false)); + Pair<Double, Long> sampleInfo = getSampleInfo(); + params.put("scaleFactor", String.valueOf(sampleInfo.first)); + StringSubstitutor stringSubstitutor; + if (tableSample == null) { + // Do full analyze + LOG.info(String.format("Will do full collection for column %s", col.getName())); Review Comment: Use the logger's parameterized message format instead of String.format. -- This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org For additional commands, e-mail: commits-h...@doris.apache.org