Jibing-Li commented on code in PR #21207: URL: https://github.com/apache/doris/pull/21207#discussion_r1250368577
########## fe/fe-core/src/main/java/org/apache/doris/statistics/util/StatisticsUtil.java: ########## @@ -461,4 +478,102 @@ public static int getTableHealth(long totalRows, long updatedRows) { return (int) (healthCoefficient * 100.0); } } + + /** + * Estimate hive table row count. + * First get it from remote table parameters. If not found, estimate it : totalSize/estimatedRowSize + * @param table Hive HMSExternalTable to estimate row count. + * @return estimated row count + */ + public static long getHiveRowCount(HMSExternalTable table) { + Map<String, String> parameters = table.getRemoteTable().getParameters(); + if (parameters == null) { + return -1; + } + // Table parameters contains row count, simply get and return it. + if (parameters.containsKey(NUM_ROWS)) { + return Long.parseLong(parameters.get(NUM_ROWS)); + } + if (!parameters.containsKey(TOTAL_SIZE)) { + return -1; + } + // Table parameters doesn't contain row count but contain total size. Estimate row count : totalSize/rowSize + long totalSize = Long.parseLong(parameters.get(TOTAL_SIZE)); + long estimatedRowSize = 0; + for (Column column : table.getFullSchema()) { + estimatedRowSize += column.getDataType().getSlotSize(); + } + if (estimatedRowSize == 0) { + return 1; + } + return totalSize / estimatedRowSize; + } + + /** + * Estimate iceberg table row count. + * Get the row count by adding all task file recordCount. + * @param table Iceberg HMSExternalTable to estimate row count. 
+ * @return estimated row count + */ + public static long getIcebergRowCount(HMSExternalTable table) { + long rowCount = 0; + try { + Table icebergTable = HiveMetaStoreClientHelper.getIcebergTable(table); + TableScan tableScan = icebergTable.newScan().includeColumnStats(); + for (FileScanTask task : tableScan.planFiles()) { + rowCount += task.file().recordCount(); + } + return rowCount; + } catch (Exception e) { + LOG.warn("Fail to collect row count for db {} table {}", table.getDbName(), table.getName(), e); + } + return -1; + } + + /** + * Estimate hive table row count : totalFileSize/estimatedRowSize + * @param table Hive HMSExternalTable to estimate row count. + * @return estimated row count + */ + public static long getRowCountFromFileList(HMSExternalTable table) { Review Comment: At least we need to get all the partition values. Then we can randomly choose some of the partitions as sample partitions to estimate the row count, assuming all partitions contain an identical number of rows. In this case we don't need to access all data files in all partitions. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org For additional commands, e-mail: commits-h...@doris.apache.org