tflobbe commented on a change in pull request #1297: SOLR-14253 Replace various sleep calls with ZK waits URL: https://github.com/apache/lucene-solr/pull/1297#discussion_r385436094
########## File path: solr/core/src/java/org/apache/solr/cloud/ZkController.java ########## @@ -1684,58 +1685,37 @@ private void doGetShardIdAndNodeNameProcess(CoreDescriptor cd) { } private void waitForCoreNodeName(CoreDescriptor descriptor) { - int retryCount = 320; - log.debug("look for our core node name"); - while (retryCount-- > 0) { - final DocCollection docCollection = zkStateReader.getClusterState() - .getCollectionOrNull(descriptor.getCloudDescriptor().getCollectionName()); - if (docCollection != null && docCollection.getSlicesMap() != null) { - final Map<String, Slice> slicesMap = docCollection.getSlicesMap(); - for (Slice slice : slicesMap.values()) { - for (Replica replica : slice.getReplicas()) { - // TODO: for really large clusters, we could 'index' on this - - String nodeName = replica.getStr(ZkStateReader.NODE_NAME_PROP); - String core = replica.getStr(ZkStateReader.CORE_NAME_PROP); - - String msgNodeName = getNodeName(); - String msgCore = descriptor.getName(); - - if (msgNodeName.equals(nodeName) && core.equals(msgCore)) { - descriptor.getCloudDescriptor() - .setCoreNodeName(replica.getName()); - getCoreContainer().getCoresLocator().persist(getCoreContainer(), descriptor); - return; - } - } + log.debug("waitForCoreNodeName >>> look for our core node name"); + try { + zkStateReader.waitForState(descriptor.getCollectionName(), 320, TimeUnit.SECONDS, c -> { + String name = ClusterStateMutator.getAssignedCoreNodeName(c, getNodeName(), descriptor.getName()); + if (name == null) { + return false; } - } - try { - Thread.sleep(1000); - } catch (InterruptedException e) { - Thread.currentThread().interrupt(); - } + descriptor.getCloudDescriptor().setCoreNodeName(name); + return true; + }); + } catch (TimeoutException | InterruptedException e) { + throw new SolrException(ErrorCode.SERVER_ERROR, "Timeout waiting for collection state", e); } + getCoreContainer().getCoresLocator().persist(getCoreContainer(), descriptor); } - private void 
waitForShardId(CoreDescriptor cd) { + private void waitForShardId(final CoreDescriptor cd) { log.debug("waiting to find shard id in clusterstate for " + cd.getName()); - int retryCount = 320; - while (retryCount-- > 0) { - final String shardId = zkStateReader.getClusterState().getShardId(cd.getCollectionName(), getNodeName(), cd.getName()); - if (shardId != null) { - cd.getCloudDescriptor().setShardId(shardId); - return; - } - try { - Thread.sleep(1000); - } catch (InterruptedException e) { - Thread.currentThread().interrupt(); - } + try { + zkStateReader.waitForState(cd.getCollectionName(), 320, TimeUnit.SECONDS, c -> { + if (c == null) return false; + final String shardId = c.getShardId(getNodeName(), cd.getName()); + if (shardId != null) { + cd.getCloudDescriptor().setShardId(shardId); + return true; + } + return false; + }); + } catch (TimeoutException | InterruptedException e) { + throw new SolrException(ErrorCode.SERVER_ERROR, "Could not get shard id for core: " + cd.getName()); Review comment: Same as before, we should probably re-set the thread's interrupt flag (Thread.currentThread().interrupt()) when an InterruptedException is caught. Also, did you intentionally not wrap the exception, i.e. not pass `e` as the cause of the new SolrException? ---------------------------------------------------------------- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org With regards, Apache Git Services --------------------------------------------------------------------- To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org For additional commands, e-mail: issues-h...@lucene.apache.org