Jibing-Li commented on code in PR #26435: URL: https://github.com/apache/doris/pull/26435#discussion_r1390553845
########## fe/fe-core/src/main/java/org/apache/doris/statistics/OlapAnalysisTask.java: ########## @@ -85,46 +75,95 @@ public void doExecute() throws Exception { * 3. insert col stats and partition stats */ protected void doSample() throws Exception { - Pair<List<Long>, Long> pair = calcActualSampleTablets(); + LOG.info(String.format("Will do sample collection for column %s", col.getName())); + Pair<List<Long>, Long> pair = calcActualSampleTablets(isPartitionColumn()); + LOG.info(String.format("Number of tablets selected %d, rows in tablets %d", pair.first.size(), pair.second)); List<Long> tabletIds = pair.first; double scaleFactor = (double) tbl.getRowCount() / (double) pair.second; // might happen if row count in fe metadata hasn't been updated yet if (Double.isInfinite(scaleFactor) || Double.isNaN(scaleFactor)) { + LOG.warn("Scale factor is infinite or Nan, will set scale factor to 1."); scaleFactor = 1; tabletIds = Collections.emptyList(); + pair.second = tbl.getRowCount(); } String tabletStr = tabletIds.stream() .map(Object::toString) .collect(Collectors.joining(", ")); try (AutoCloseConnectContext r = StatisticsUtil.buildConnectContext(info.jobType.equals(JobType.SYSTEM))) { + // Get basic stats, including min and max. 
+ ResultRow basicStats = collectBasicStat(r); + long rowCount = tbl.getRowCount(); + String min = Base64.getEncoder().encodeToString(basicStats.get(0).getBytes(StandardCharsets.UTF_8)); + String max = Base64.getEncoder().encodeToString(basicStats.get(1).getBytes(StandardCharsets.UTF_8)); + + boolean limitFlag = false; + long rowsToSample = pair.second; Map<String, String> params = new HashMap<>(); params.put("internalDB", FeConstants.INTERNAL_DB_NAME); params.put("columnStatTbl", StatisticConstants.STATISTIC_TBL_NAME); params.put("catalogId", String.valueOf(catalog.getId())); + params.put("catalogName", catalog.getName()); params.put("dbId", String.valueOf(db.getId())); params.put("tblId", String.valueOf(tbl.getId())); params.put("idxId", String.valueOf(info.indexId)); params.put("colId", String.valueOf(info.colName)); - params.put("dataSizeFunction", getDataSizeFunction(col)); + params.put("dataSizeFunction", getDataSizeFunction(col, false)); params.put("dbName", db.getFullName()); params.put("colName", info.colName); params.put("tblName", tbl.getName()); params.put("scaleFactor", String.valueOf(scaleFactor)); - params.put("tablets", tabletStr.isEmpty() ? "" : String.format("TABLET(%s)", tabletStr)); + params.put("sampleHints", tabletStr.isEmpty() ? "" : String.format("TABLET(%s)", tabletStr)); + params.put("ndvFunction", getNdvFunction(String.valueOf(rowCount))); + params.put("min", min); + params.put("max", max); + params.put("rowCount", String.valueOf(rowCount)); + params.put("type", col.getType().toString()); + params.put("limit", ""); + if (needLimit()) { + // If the tablets to be sampled are too large, use limit to control the rows to read, and re-calculate + // the scaleFactor. 
+ limitFlag = true; + rowsToSample = Math.min(getSampleRows(), pair.second); + params.put("limit", "limit " + rowsToSample); + params.put("scaleFactor", String.valueOf(scaleFactor * (double) pair.second / rowsToSample)); + } StringSubstitutor stringSubstitutor = new StringSubstitutor(params); - stmtExecutor = new StmtExecutor(r.connectContext, stringSubstitutor.replace(SAMPLE_COLUMN_SQL_TEMPLATE)); - // Scalar query only return one row - ColStatsData colStatsData = new ColStatsData(stmtExecutor.executeInternalQuery().get(0)); - job.appendBuf(this, Collections.singletonList(colStatsData)); + String sql; + // Distribution columns don't fit for DUJ1 estimator, use linear estimator. + if (isDistributionColumn()) { + params.put("min", StatisticsUtil.quote(min)); + params.put("max", StatisticsUtil.quote(max)); + sql = stringSubstitutor.replace(LINEAR_ANALYZE_TEMPLATE); + } else { + params.put("dataSizeFunction", getDataSizeFunction(col, true)); + sql = stringSubstitutor.replace(DUJ1_ANALYZE_TEMPLATE); + } + LOG.info(String.format("Sample for column [%s]. Total rows [%s], rows to sample [%d], scale factor [%s], " Review Comment: How about keeping this info-level log? It's helpful for investigating when an error happens. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org For additional commands, e-mail: commits-h...@doris.apache.org