mcvsubbu commented on a change in pull request #7122: URL: https://github.com/apache/incubator-pinot/pull/7122#discussion_r668223484
########## File path: pinot-controller/src/main/java/org/apache/pinot/controller/helix/core/PinotHelixResourceManager.java ########## @@ -461,6 +462,34 @@ public synchronized PinotResourceManagerResponse updateInstanceTags(String insta return PinotResourceManagerResponse.SUCCESS; } + /** + * Validates whether an instance is offline. + * Since ZNodes under "/LIVEINSTANCES" are ephemeral, if there is a ZK session expire (e.g. due to network issue), + * the ZNode under "/LIVEINSTANCES" will be deleted. Thus, such race condition can happen when this task is running. + * In order to double confirm the live status of an instance, the field "LAST_OFFLINE_TIME" in ZNode under + * "/INSTANCES/<instance_id>/HISTORY" needs to be checked. If the value is "-1", that means the instance is ONLINE; + * if the value is a timestamp, that means the instance starts to be OFFLINE since that time. + * @param instanceId instance id + * @param offlineTimeRangeMs the time range that it's valid for an instance to be offline + */ + public boolean isInstanceOffline(String instanceId, long offlineTimeRangeMs) { + // Check if the instance is included in /LIVEINSTANCES + if (_helixDataAccessor.getProperty(_keyBuilder.liveInstance(instanceId)) != null) { + return false; + } + ParticipantHistory participantHistory = _helixDataAccessor.getProperty(_keyBuilder.participantHistory(instanceId)); + long lastOfflineTime = participantHistory.getLastOfflineTime(); + if (lastOfflineTime == -1) { Review comment: I meant if `offlineTimeRangeMs` is any negative value, then don't delete. ########## File path: pinot-controller/src/main/java/org/apache/pinot/controller/helix/core/PinotHelixResourceManager.java ########## @@ -461,6 +462,34 @@ public synchronized PinotResourceManagerResponse updateInstanceTags(String insta return PinotResourceManagerResponse.SUCCESS; } + /** + * Validates whether an instance is offline. + * Since ZNodes under "/LIVEINSTANCES" are ephemeral, if there is a ZK session expire (e.g. due to network issue), + * the ZNode under "/LIVEINSTANCES" will be deleted. Thus, such race condition can happen when this task is running. + * In order to double confirm the live status of an instance, the field "LAST_OFFLINE_TIME" in ZNode under + * "/INSTANCES/<instance_id>/HISTORY" needs to be checked. If the value is "-1", that means the instance is ONLINE; + * if the value is a timestamp, that means the instance starts to be OFFLINE since that time. + * @param instanceId instance id + * @param offlineTimeRangeMs the time range that it's valid for an instance to be offline + */ + public boolean isInstanceOffline(String instanceId, long offlineTimeRangeMs) { Review comment: No, it does more than just check for offline. it checks for threshold of offline time. Maybe change it to `isOfflineForTooLong` or something like that, but I will leave it to you. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: commits-unsubscr...@pinot.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@pinot.apache.org For additional commands, e-mail: commits-h...@pinot.apache.org