zhaih commented on code in PR #12660: URL: https://github.com/apache/lucene/pull/12660#discussion_r1367955218
##########
lucene/core/src/java/org/apache/lucene/util/hnsw/HnswConcurrentMergeBuilder.java:
##########
@@ -0,0 +1,234 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.util.hnsw;
+
+import static org.apache.lucene.search.DocIdSetIterator.NO_MORE_DOCS;
+import static org.apache.lucene.util.hnsw.HnswGraphBuilder.HNSW_COMPONENT;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.concurrent.ExecutionException;
+import java.util.concurrent.ExecutorService;
+import java.util.concurrent.Future;
+import java.util.concurrent.atomic.AtomicInteger;
+import org.apache.lucene.util.BitSet;
+import org.apache.lucene.util.FixedBitSet;
+import org.apache.lucene.util.IOUtils;
+import org.apache.lucene.util.InfoStream;
+import org.apache.lucene.util.ThreadInterruptedException;
+
+/**
+ * A graph builder that manages multiple workers, it only supports adding the whole graph all at
+ * once. It will spawn a thread for each worker and the workers will pick the work in batches.
+ */
+public class HnswConcurrentMergeBuilder implements IHnswGraphBuilder {
+
+  private static final int BATCH_SIZE = 2048;
+
+  private final ExecutorService exec;
+  private final ConcurrentMergeWorker[] workers;
+  private InfoStream infoStream = InfoStream.getDefault();
+
+  public HnswConcurrentMergeBuilder(
+      ExecutorService exec,
+      int numWorker,
+      RandomVectorScorerSupplier scorerSupplier,
+      int M,
+      int beamWidth,
+      OnHeapHnswGraph hnsw,
+      BitSet initializedNodes)
+      throws IOException {
+    this.exec = exec;
+    AtomicInteger workProgress = new AtomicInteger(0);
+    workers = new ConcurrentMergeWorker[numWorker];
+    for (int i = 0; i < numWorker; i++) {
+      workers[i] =
+          new ConcurrentMergeWorker(
+              scorerSupplier.copy(),
+              M,
+              beamWidth,
+              HnswGraphBuilder.randSeed,
+              hnsw,
+              initializedNodes,
+              workProgress);
+    }
+  }
+
+  @Override
+  public OnHeapHnswGraph build(int maxOrd) throws IOException {
+    if (infoStream.isEnabled(HNSW_COMPONENT)) {
+      infoStream.message(
+          HNSW_COMPONENT,
+          "build graph from " + maxOrd + " vectors, with " + workers.length + " workers");
+    }
+    List<Future<?>> futures = new ArrayList<>();
+    for (int i = 0; i < workers.length; i++) {
+      int finalI = i;
+      futures.add(
+          exec.submit(
+              () -> {
+                try {
+                  workers[finalI].run(maxOrd);
+                } catch (IOException e) {
+                  throw new RuntimeException(e);
+                }
+              }));
+    }
+    Throwable exc = null;
+    for (Future<?> future : futures) {
+      try {
+        future.get();
+      } catch (InterruptedException e) {
+        var newException = new ThreadInterruptedException(e);
+        if (exc == null) {
+          exc = newException;
+        } else {
+          exc.addSuppressed(newException);
+        }
+      } catch (ExecutionException e) {
+        if (exc == null) {
+          exc = e.getCause();
+        } else {
+          exc.addSuppressed(e.getCause());
+        }
+      }
+    }
+    if (exc != null) {
+      // The error handling was copied from TaskExecutor. should we just use TaskExecutor instead?
+      throw IOUtils.rethrowAlways(exc);
+    }
+    return workers[0].getGraph();
+  }
+
+  @Override
+  public void addGraphNode(int node) throws IOException {
+    throw new UnsupportedOperationException("This builder is for merge only");
+  }
+
+  @Override
+  public void setInfoStream(InfoStream infoStream) {
+    this.infoStream = infoStream;
+    for (IHnswGraphBuilder worker : workers) {
+      worker.setInfoStream(infoStream);
+    }
+  }
+
+  @Override
+  public OnHeapHnswGraph getGraph() {
+    return workers[0].getGraph();
+  }
+
+  private static final class ConcurrentMergeWorker extends HnswGraphBuilder {
+
+    private final AtomicInteger workProgress;
+    private final BitSet initializedNodes;
+
+    private ConcurrentMergeWorker(
+        RandomVectorScorerSupplier scorerSupplier,
+        int M,
+        int beamWidth,
+        long seed,
+        OnHeapHnswGraph hnsw,
+        BitSet initializedNodes,
+        AtomicInteger workProgress)
+        throws IOException {
+      super(
+          scorerSupplier,
+          M,
+          beamWidth,
+          seed,
+          hnsw,
+          new MergeSearcher(
+              new NeighborQueue(beamWidth, true), new FixedBitSet(hnsw.maxNodeId() + 1)));
+      this.workProgress = workProgress;
+      this.initializedNodes = initializedNodes;
+    }
+
+    private void run(int maxOrd) throws IOException {

Review Comment:
   I think this gives a better load balance. If we divide the work upfront, say giving 1/8 of the docs to each thread (like what I did before), then some threads will likely finish much earlier and just wait at the end. Or, if we spawn a lot of threads and give each of them only 2048 docs, I'm worried it will use too much memory in the end and make the GC unhappy.
   This seems to be the best way of doing it IMO, and it's not too complex -- just do a CAS and no more synchronization is necessary. And in fact this does give better performance: previously, for a 100k-doc merge, the merge time was around 20k ms, and now it's around 18k ms (see the run with contention time shown in the description).
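   For readers following along, here is a minimal, self-contained sketch of the batch-claiming idea being discussed (the PR's actual `run()` body is elided from the hunk above, so the class and method names below are illustrative only): all workers share one `AtomicInteger` and atomically claim the next `BATCH_SIZE` ordinals, so a fast worker simply grabs more batches instead of sitting idle.

   ```java
   import java.util.concurrent.atomic.AtomicInteger;

   // Illustrative sketch, not the PR's code: dynamic work claiming via a shared counter.
   final class BatchClaimSketch {
     private static final int BATCH_SIZE = 2048;

     // Shared across all workers; each getAndAdd hands out a disjoint [start, end) range.
     private final AtomicInteger workProgress = new AtomicInteger(0);

     void claimAndProcess(int maxOrd) {
       while (true) {
         int start = workProgress.getAndAdd(BATCH_SIZE);
         if (start >= maxOrd) {
           return; // all work has already been handed out
         }
         int end = Math.min(maxOrd, start + BATCH_SIZE);
         for (int node = start; node < end; node++) {
           processNode(node); // in the PR this would add the node to the shared HNSW graph
         }
       }
     }

     private void processNode(int node) {
       // placeholder for the per-node work
     }
   }
   ```

   The only cross-thread coordination here is the `getAndAdd` on the shared counter, which matches the "just do a CAS and no more synchronization" point above.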