morningman commented on code in PR #21207: URL: https://github.com/apache/doris/pull/21207#discussion_r1243407638
########## fe/fe-core/src/main/java/org/apache/doris/catalog/external/HMSExternalTable.java: ########## @@ -348,6 +375,11 @@ public Partition getPartition(List<String> partitionValues) { return client.getPartition(dbName, name, partitionValues); } + public List<Partition> getPartitionsByFilter(String filter) { Review Comment: unused? ########## fe/fe-core/src/main/java/org/apache/doris/catalog/external/HMSExternalTable.java: ########## @@ -433,5 +467,83 @@ private void initPartitionColumns(List<Column> schema) { LOG.debug("get {} partition columns for table: {}", partitionColumns.size(), name); } + private long getHiveRowCount() { + Map<String, String> parameters = remoteTable.getParameters(); + if (parameters == null) { + return -1; + } + if (parameters.containsKey(NUM_ROWS)) { + return Long.parseLong(parameters.get(NUM_ROWS)); + } + if (!parameters.containsKey(TOTAL_SIZE)) { + return -1; + } + long totalSize = Long.parseLong(parameters.get(TOTAL_SIZE)); + long estimatedRowSize = 0; + for (Column column : getFullSchema()) { + estimatedRowSize += column.getDataType().getSlotSize(); + } + if (estimatedRowSize == 0) { + return 1; + } + return totalSize / estimatedRowSize; + } + + private long getIcebergRowCount() { + long rowCount = 0; + try { + Table icebergTable = HiveMetaStoreClientHelper.getIcebergTable(this); + TableScan tableScan = icebergTable.newScan().includeColumnStats(); + for (FileScanTask task : tableScan.planFiles()) { + rowCount += task.file().recordCount(); + } + return rowCount; + } catch (Exception e) { + LOG.warn(String.format("Fail to collect row count for db %s table %s", dbName, name), e); + } + return -1; + } + + private long getRowCountFromFileList() { Review Comment: This method is costy. ########## fe/fe-core/src/main/java/org/apache/doris/catalog/external/HMSExternalTable.java: ########## @@ -433,5 +467,83 @@ private void initPartitionColumns(List<Column> schema) { LOG.debug("get {} partition columns for table: {}", partitionColumns.size(), name); } + private long getHiveRowCount() { + Map<String, String> parameters = remoteTable.getParameters(); + if (parameters == null) { + return -1; + } + if (parameters.containsKey(NUM_ROWS)) { + return Long.parseLong(parameters.get(NUM_ROWS)); + } + if (!parameters.containsKey(TOTAL_SIZE)) { + return -1; + } + long totalSize = Long.parseLong(parameters.get(TOTAL_SIZE)); + long estimatedRowSize = 0; + for (Column column : getFullSchema()) { + estimatedRowSize += column.getDataType().getSlotSize(); + } + if (estimatedRowSize == 0) { + return 1; + } + return totalSize / estimatedRowSize; + } + + private long getIcebergRowCount() { Review Comment: 2 issues: 1. I think this method should be move to a separate class, this is not only for `HMSExternalTable`. How about create a new Util class and move all `getRowCountxxx` methods to it. 2. Here you call `icebergTable.newScan()`, so that each query, we need to call `icebergTable.newScan()` twice, one is here, the other is in plan process, which is costy. ########## fe/fe-core/src/main/java/org/apache/doris/catalog/external/HMSExternalTable.java: ########## @@ -433,5 +467,83 @@ private void initPartitionColumns(List<Column> schema) { LOG.debug("get {} partition columns for table: {}", partitionColumns.size(), name); } + private long getHiveRowCount() { + Map<String, String> parameters = remoteTable.getParameters(); + if (parameters == null) { + return -1; + } + if (parameters.containsKey(NUM_ROWS)) { + return Long.parseLong(parameters.get(NUM_ROWS)); + } + if (!parameters.containsKey(TOTAL_SIZE)) { + return -1; + } + long totalSize = Long.parseLong(parameters.get(TOTAL_SIZE)); + long estimatedRowSize = 0; + for (Column column : getFullSchema()) { + estimatedRowSize += column.getDataType().getSlotSize(); + } + if (estimatedRowSize == 0) { + return 1; + } + return totalSize / estimatedRowSize; + } + + private long getIcebergRowCount() { + long rowCount = 0; + try { + Table icebergTable = HiveMetaStoreClientHelper.getIcebergTable(this); + TableScan tableScan = icebergTable.newScan().includeColumnStats(); + for (FileScanTask task : tableScan.planFiles()) { + rowCount += task.file().recordCount(); + } + return rowCount; + } catch (Exception e) { + LOG.warn(String.format("Fail to collect row count for db %s table %s", dbName, name), e); Review Comment: use `{}` instead of `String.format` to make code simple ########## fe/fe-core/src/main/java/org/apache/doris/catalog/external/HMSExternalTable.java: ########## @@ -395,7 +427,9 @@ public long estimatedRowCount() { Optional<TableStatistic> tableStatistics = Env.getCurrentEnv().getStatisticsCache().getTableStatistics( catalog.getId(), catalog.getDbOrAnalysisException(dbName).getId(), id); if (tableStatistics.isPresent()) { - return tableStatistics.get().rowCount; + long rowCount = tableStatistics.get().rowCount; + LOG.info(String.format("Estimated row count for db %s table %s is %d.", dbName, name, rowCount)); Review Comment: ```suggestion LOG.debug("Estimated row count for db {} table {} is {}.", dbName, name, rowCount); ``` -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org For additional commands, e-mail: commits-h...@doris.apache.org