janhoy commented on a change in pull request #1387: SOLR-14210: Include replica health in healthcheck handler URL: https://github.com/apache/lucene-solr/pull/1387#discussion_r403045311
########## File path: solr/core/src/java/org/apache/solr/handler/admin/HealthCheckHandler.java ########## @@ -88,15 +98,45 @@ public void handleRequestBody(SolrQueryRequest req, SolrQueryResponse rsp) throw return; } - // Set status to true if this node is in live_nodes - if (clusterState.getLiveNodes().contains(cores.getZkController().getNodeName())) { - rsp.add(STATUS, OK); - } else { + // Fail if not in live_nodes + if (!clusterState.getLiveNodes().contains(cores.getZkController().getNodeName())) { rsp.add(STATUS, FAILURE); rsp.setException(new SolrException(SolrException.ErrorCode.SERVICE_UNAVAILABLE, "Host Unavailable: Not in live nodes as per zk")); + return; } - rsp.setHttpCaching(false); + // Optionally require that all cores on this node are active if param 'requireHealthyCores=true' + if (req.getParams().getBool(PARAM_REQUIRE_HEALTHY_CORES, false)) { + Collection<CloudDescriptor> coreDescriptors = cores.getCores().stream() + .map(c -> c.getCoreDescriptor().getCloudDescriptor()).collect(Collectors.toList()); + List<String> unhealthyCores = findUnhealthyCores(coreDescriptors, clusterState); + if (unhealthyCores.size() > 0) { + rsp.add(STATUS, FAILURE); + rsp.setException(new SolrException(SolrException.ErrorCode.SERVICE_UNAVAILABLE, + "Replica(s) " + unhealthyCores + " are currently initializing or recovering")); + return; + } + rsp.add("message", "All cores are healthy"); + } + + // All lights green, report healthy + rsp.add(STATUS, OK); + } + + /** + * Find replicas DOWN or RECOVERING, or replicas in clusterstate that do not exist on local node. 
+ * We first find local cores which are either not registered or unhealthy, and check each of these against + * the clusterstate, and return a list of unhealthy replicas that are part of an active shard for an existing collection + * @param cores list of core descriptors to iterate + * @param clusterState clusterstate from ZK + * @return list of core names that are either DOWN or RECOVERING on 'nodeName' + */ + static List<String> findUnhealthyCores(Collection<CloudDescriptor> cores, ClusterState clusterState) { Review comment: @shalinmangar Please have a look at the new logic. If all local cores are active and registered, then we do not consult clusterstate at all. And we only consult clusterstate now to filter out replicas from inactive shards. The only thing I'm unsure of now is whether I feed the correct **slice ID** to `.getActiveSlicesMap().containsKey(**HERE**)`. Is this slice ID the same as `c.getShardId()` on CloudDescriptor? My unit test is a mock, so I cannot be sure :) ---------------------------------------------------------------- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org With regards, Apache Git Services --------------------------------------------------------------------- To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org For additional commands, e-mail: issues-h...@lucene.apache.org