Jackie-Jiang commented on code in PR #10027: URL: https://github.com/apache/pinot/pull/10027#discussion_r1060987608
########## pinot-common/src/main/java/org/apache/pinot/common/metrics/ControllerGauge.java: ########## @@ -99,9 +99,16 @@ public enum ControllerGauge implements AbstractMetrics.Gauge { // Number of Tasks Status TASK_STATUS("taskStatus", false), - // Number of dropped minion instances + // Number of dropped stale minion instances DROPPED_MINION_INSTANCES("droppedMinionInstances", true), + // Number of dropped stale broker instances + DROPPED_BROKER_INSTANCES("droppedBrokerInstances", true), + + // Number of dropped stale server instances + DROPPED_SERVER_INSTANCES("droppedServerInstances", true), + + Review Comment: (nit) Remove extra empty line ########## pinot-controller/src/main/java/org/apache/pinot/controller/helix/core/cleanup/StaleInstancesCleanupTask.java: ########## @@ -34,23 +36,30 @@ /** - * A periodic task to clean up offline Minion instances to not spam Helix. + * Automatically removes stale instances from the cluster to not spam Helix. + * Stale instance is the instance not in use (not hosting any data or query) and has been in the offline status for more + * than the stale instance retention time. */ -public class MinionInstancesCleanupTask extends BasePeriodicTask { - private static final Logger LOGGER = LoggerFactory.getLogger(MinionInstancesCleanupTask.class); - private final static String TASK_NAME = "MinionInstancesCleanupTask"; +public class StaleInstancesCleanupTask extends BasePeriodicTask { + private static final Logger LOGGER = LoggerFactory.getLogger(StaleInstancesCleanupTask.class); + private final static String TASK_NAME = "StaleInstancesCleanupTask"; + protected final PinotHelixResourceManager _pinotHelixResourceManager; protected final LeadControllerManager _leadControllerManager; protected final ControllerMetrics _controllerMetrics; + // This applies to both broker and server instances. + private final long _staleInstancesCleanupTaskMinOfflineTimeBeforeDeletionInMilliseconds; private final long _minionInstanceCleanupTaskMinOfflineTimeBeforeDeletionInMilliseconds; Review Comment: Deprecate this one? ########## pinot-controller/src/main/java/org/apache/pinot/controller/ControllerConf.java: ########## @@ -139,6 +139,14 @@ public static class ControllerPeriodicTasksConf { "controller.minion.instances.cleanup.task.minOfflineTimeBeforeDeletionSeconds"; public static final String MINION_INSTANCES_CLEANUP_TASK_MIN_OFFLINE_TIME_BEFORE_DELETION_PERIOD = "controller.minion.instances.cleanup.task.minOfflineTimeBeforeDeletionPeriod"; + + public static final String STALE_INSTANCES_CLEANUP_TASK_FREQUENCY_PERIOD = Review Comment: Add `Deprecated` annotation to old minion cleanup configs? ########## pinot-controller/src/main/java/org/apache/pinot/controller/helix/core/cleanup/StaleInstancesCleanupTask.java: ########## @@ -82,7 +95,47 @@ protected void runTask(Properties periodicTaskProperties) { _controllerMetrics.addValueToGlobalGauge(ControllerGauge.DROPPED_MINION_INSTANCES, 1); } } + continue; + } + + // Drop the broker instance if it has been offline for more than a period of this task. + if (InstanceTypeUtils.isBroker(offlineInstance) && !brokerInstancesInUse.contains(offlineInstance)) { + if (_pinotHelixResourceManager.isInstanceOfflineFor(offlineInstance, + _staleInstancesCleanupTaskMinOfflineTimeBeforeDeletionInMilliseconds)) { + LOGGER.info("Dropping broker instance: {}", offlineInstance); + PinotResourceManagerResponse response = _pinotHelixResourceManager.dropInstance(offlineInstance); + if (response.isSuccessful()) { + _controllerMetrics.addValueToGlobalGauge(ControllerGauge.DROPPED_BROKER_INSTANCES, 1); + } + } + continue; + } + + // Drop the server instance if it has been offline for more than a period of this task. + if (InstanceTypeUtils.isServer(offlineInstance) && !serverInstancesInUse.contains(offlineInstance)) { + if (_pinotHelixResourceManager.isInstanceOfflineFor(offlineInstance, + _staleInstancesCleanupTaskMinOfflineTimeBeforeDeletionInMilliseconds)) { + LOGGER.info("Dropping server instance: {}", offlineInstance); + PinotResourceManagerResponse response = _pinotHelixResourceManager.dropInstance(offlineInstance); + if (response.isSuccessful()) { + _controllerMetrics.addValueToGlobalGauge(ControllerGauge.DROPPED_SERVER_INSTANCES, 1); + } + } } } } + + private Set<String> getBrokerInstancesInUse() { Review Comment: No need to read all tables. Only need to read the `brokerResource` IS. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: commits-unsubscr...@pinot.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@pinot.apache.org For additional commands, e-mail: commits-h...@pinot.apache.org