morningman commented on code in PR #25175: URL: https://github.com/apache/doris/pull/25175#discussion_r1351812538
########## fe/fe-core/src/main/java/org/apache/doris/catalog/TableIf.java: ########## @@ -142,6 +142,8 @@ default int getBaseColumnIdxByName(String colName) { Map<String, Set<String>> findReAnalyzeNeededPartitions(); + List<Long> getChunkSizes(); Review Comment: Add comment for these new interface ########## fe/fe-core/src/main/java/org/apache/doris/statistics/HMSAnalysisTask.java: ########## @@ -58,14 +58,15 @@ public class HMSAnalysisTask extends BaseAnalysisTask { + "${idxId} AS idx_id, " + "'${colId}' AS col_id, " + "NULL AS part_id, " - + "${countExpr} AS row_count, " - + "NDV(`${colName}`) AS ndv, " - + "${nullCountExpr} AS null_count, " + + "ROUND(COUNT(1) * ${scaleFactor}) AS row_count, " + + "case when NDV(`${colName}`)/count('${colName}') < 0.3 then NDV(`${colName}`) " Review Comment: this factor should be defined some where ########## fe/fe-core/src/main/java/org/apache/doris/catalog/external/ExternalTable.java: ########## @@ -396,4 +396,9 @@ public Map<String, Set<String>> findReAnalyzeNeededPartitions() { partitions.add("Dummy Partition"); return getBaseSchema().stream().collect(Collectors.toMap(Column::getName, k -> partitions)); } + + @Override + public List<Long> getChunkSizes() { + return null; Review Comment: Better not return "null" ########## fe/fe-core/src/main/java/org/apache/doris/statistics/HMSAnalysisTask.java: ########## @@ -277,38 +277,53 @@ private Map<String, String> buildTableStatsParams(String partId) { commonParams.put("catalogName", catalog.getName()); commonParams.put("dbName", db.getFullName()); commonParams.put("tblName", tbl.getName()); - commonParams.put("countExpr", getCountExpression()); + commonParams.put("sampleExpr", getSampleExpression()); + commonParams.put("scaleFactor", getSampleScaleFactor()); if (col != null) { commonParams.put("type", col.getType().toString()); } commonParams.put("lastAnalyzeTimeInMs", String.valueOf(System.currentTimeMillis())); return commonParams; } - protected String getCountExpression() { - if (info.samplePercent > 0) { - return String.format("ROUND(COUNT(1) * 100 / %d)", info.samplePercent); - } else { - return "COUNT(1)"; + protected String getSampleExpression() { + if (tableSample == null) { + return ""; } - } - - protected String getNullCountExpression() { - if (info.samplePercent > 0) { - return String.format("ROUND(SUM(CASE WHEN `${colName}` IS NULL THEN 1 ELSE 0 END) * 100 / %d)", - info.samplePercent); + if (tableSample.isPercent()) { + return String.format("TABLESAMPLE(%d PERCENT)", tableSample.getSampleValue()); } else { - return "SUM(CASE WHEN `${colName}` IS NULL THEN 1 ELSE 0 END)"; + return String.format("TABLESAMPLE(%d ROWS)", tableSample.getSampleValue()); } } - protected String getDataSizeFunction(Column column) { - String originFunction = super.getDataSizeFunction(column); - if (info.samplePercent > 0 && !isPartitionOnly) { - return String.format("ROUND((%s) * 100 / %d)", originFunction, info.samplePercent); + protected String getSampleScaleFactor() { Review Comment: Add some comment to explain the logic in code, better give some example -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org For additional commands, e-mail: commits-h...@doris.apache.org