janhoy commented on a change in pull request #1387: SOLR-14210: Include replica health in healtcheck handler URL: https://github.com/apache/lucene-solr/pull/1387#discussion_r402344025
########## File path: solr/core/src/java/org/apache/solr/handler/admin/HealthCheckHandler.java ########## @@ -88,15 +96,46 @@ public void handleRequestBody(SolrQueryRequest req, SolrQueryResponse rsp) throw return; } - // Set status to true if this node is in live_nodes - if (clusterState.getLiveNodes().contains(cores.getZkController().getNodeName())) { - rsp.add(STATUS, OK); - } else { + // Fail if not in live_nodes + if (!clusterState.getLiveNodes().contains(cores.getZkController().getNodeName())) { rsp.add(STATUS, FAILURE); rsp.setException(new SolrException(SolrException.ErrorCode.SERVICE_UNAVAILABLE, "Host Unavailable: Not in live nodes as per zk")); + return; } - rsp.setHttpCaching(false); + // Optionally require that all cores on this node are active if param 'requireHealthyCores=true' + if (req.getParams().getBool(PARAM_REQUIRE_HEALTHY_CORES, false)) { + List<String> unhealthyCores = findUnhealthyCores(clusterState, + cores.getNodeConfig().getNodeName(), + cores.getAllCoreNames()); + if (unhealthyCores.size() > 0) { + rsp.add(STATUS, FAILURE); + rsp.setException(new SolrException(SolrException.ErrorCode.SERVICE_UNAVAILABLE, + "Replica(s) " + unhealthyCores + " are currently initializing or recovering")); + return; + } + rsp.add("message", "All cores are healthy"); + } + + // All lights green, report healthy + rsp.add(STATUS, OK); + } + + /** + * Find replicas DOWN or RECOVERING, or replicas in clusterstate that do not exist on local node + * @param clusterState clusterstate from ZK + * @param nodeName this node name + * @param allCoreNames list of all core names on current node + * @return list of core names that are either DOWN ore RECOVERING on 'nodeName' + */ + static List<String> findUnhealthyCores(ClusterState clusterState, String nodeName, Collection<String> allCoreNames) { + return clusterState.getCollectionsMap().values().stream() Review comment: I assumed ClusterState object in each node is cached on the node and iterating it will not incur any new 
ZK calls, but it is updated by watches? If it incurs connections then I agree with you! I want to exclude replicas of inactive shards from the check. The only place I could find that info was in Slice inside ClusterState. Sure, I can iterate each core on the local host, find its Slice-ID and then go look up the Slice in ClusterState to find whether it's active; that was my other alternative, but it is more code. ---------------------------------------------------------------- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org With regards, Apache Git Services --------------------------------------------------------------------- To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org For additional commands, e-mail: issues-h...@lucene.apache.org