Adrien Grand created LUCENE-10560:
-------------------------------------

             Summary: Can we speed up OrdinalMap construction?
                 Key: LUCENE-10560
                 URL: https://issues.apache.org/jira/browse/LUCENE-10560
             Project: Lucene - Core
          Issue Type: Improvement
            Reporter: Adrien Grand
         Attachments: profile.png
I've seen a few faceting queries on high-cardinality fields where the bottleneck was the construction of the OrdinalMap. As an example, I wrote this basic (and maybe flawed?) benchmark that indexes 10M docs with 1M unique values of 16 bytes each and simulates the retrieval of the top-10 most frequent values on a MatchAllDocsQuery:

{code:java}
import java.io.IOException;
import java.nio.file.Paths;
import java.util.Arrays;
import java.util.HashMap;
import java.util.Map;
import java.util.SplittableRandom;

import org.apache.lucene.document.Document;
import org.apache.lucene.document.SortedDocValuesField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.IndexWriterConfig.OpenMode;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.OrdinalMap;
import org.apache.lucene.index.SortedDocValues;
import org.apache.lucene.index.TieredMergePolicy;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.IntsRef;
import org.apache.lucene.util.LongValues;
import org.apache.lucene.util.PriorityQueue;

public class GlobalOrds {

  public static void main(String[] args) throws IOException {
    final int numDocs = 10_000_000;
    final int numValues = 1_000_000;
    final int topN = 10;

    byte[][] values = new byte[numValues][];
    byte[] bytes = new byte[16];
    SplittableRandom random = new SplittableRandom(0);
    for (int i = 0; i < values.length; ++i) {
      random.nextBytes(bytes);
      values[i] = bytes.clone();
    }

    Directory dir = FSDirectory.open(Paths.get("/tmp/a"));
    TieredMergePolicy tmp = new TieredMergePolicy();
    // 0.2MB min segment size, trying to simulate the case when documents are
    // ~10x larger than this single field
    tmp.setFloorSegmentMB(0.2);
    IndexWriter w =
        new IndexWriter(
            dir,
            new IndexWriterConfig()
                .setMaxBufferedDocs(numDocs / 1000)
                .setOpenMode(OpenMode.CREATE)
                .setMergePolicy(tmp));
    Document doc = new Document();
    SortedDocValuesField field = new SortedDocValuesField("field", new BytesRef());
    doc.add(field);
    for (int i = 0; i < numDocs; ++i) {
      field.setBytesValue(values[random.nextInt(values.length)]);
      w.addDocument(doc);
    }

    DirectoryReader reader = DirectoryReader.open(w);
    IndexSearcher searcher = new IndexSearcher(reader);
    for (int i = 0; i < 1000; ++i) {
      search(searcher, topN);
    }

    w.close();
    reader.close();
    dir.close();
  }

  static Object DUMMY; // prevent the JVM from optimizing away some stuff

  private static void search(IndexSearcher searcher, int topN) throws IOException {
    SortedDocValues[] valuesArray = new SortedDocValues[searcher.getIndexReader().leaves().size()];
    for (LeafReaderContext context : searcher.getIndexReader().leaves()) {
      valuesArray[context.ord] = context.reader().getSortedDocValues("field");
    }

    long start = System.nanoTime();
    OrdinalMap globalOrds = OrdinalMap.build(null, valuesArray, 0f);
    System.out.println("Build global ords: " + (System.nanoTime() - start));

    int[] globalCounts = new int[Math.toIntExact(globalOrds.getValueCount())];
    int[] localCounts = IntsRef.EMPTY_INTS;
    start = System.nanoTime();
    for (LeafReaderContext context : searcher.getIndexReader().leaves()) {
      SortedDocValues values = context.reader().getSortedDocValues("field");
      int valueCount = values.getValueCount();
      if (localCounts.length < valueCount) {
        localCounts = new int[ArrayUtil.oversize(valueCount, Integer.BYTES)];
      } else {
        Arrays.fill(localCounts, 0, valueCount, 0);
      }
      for (int doc = values.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = values.nextDoc()) {
        localCounts[values.ordValue()]++;
      }
      LongValues segmentToGlobal = globalOrds.getGlobalOrds(context.ord);
      for (int i = 0; i < valueCount; ++i) {
        globalCounts[Math.toIntExact(segmentToGlobal.get(i))] += localCounts[i];
      }
    }
    System.out.println("Compute global counts: " + (System.nanoTime() - start));

    start = System.nanoTime();
    PriorityQueue<int[]> topCounts =
        new PriorityQueue<int[]>(topN) {
          @Override
          protected boolean lessThan(int[] a, int[] b) {
            if (a[1] < b[1]) {
              return true;
            } else if (a[1] == b[1]) {
              return a[0] < b[0];
            } else {
              return false;
            }
          }
        };
    for (int i = 0; i < globalCounts.length; ++i) {
      if (topCounts.size() < topN) {
        topCounts.add(new int[] {i, globalCounts[i]});
      } else {
        int[] top = topCounts.top();
        if (globalCounts[i] > top[1]) {
          top[0] = i;
          top[1] = globalCounts[i];
          topCounts.updateTop();
        }
      }
    }
    System.out.println("Select top N global ords: " + (System.nanoTime() - start));

    start = System.nanoTime();
    Map<BytesRef, Integer> topTerms = new HashMap<>();
    for (int[] entry : topCounts) {
      final int ord = entry[0];
      final int count = entry[1];
      final int segmentOrd = (int) globalOrds.getFirstSegmentOrd(ord);
      final int segmentNum = globalOrds.getFirstSegmentNumber(ord);
      BytesRef value = valuesArray[segmentNum].lookupOrd(segmentOrd);
      topTerms.put(value.clone(), count);
    }
    System.out.println("Retrieval of top-N values: " + (System.nanoTime() - start));
    DUMMY = topTerms;
  }
}
{code}

On my machine, it takes roughly 4x longer to construct the OrdinalMap than to compute the count for each global ordinal (9 seconds vs. 2.3 seconds). And unfortunately, this gets worse as the query gets more selective: e.g. a query that matches 100k docs (1% of the index) could see almost the entire time spent in OrdinalMap construction. Can we speed it up? Are there better ways to compute top-k facets over a high-cardinality field?

For what it's worth, async-profiler tells me that most of the time is spent in PriorityQueue#downHeap and BytesRef#compare:

!profile.png!
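For intuition on why those two methods dominate: building the global ordinal map is essentially a k-way merge of each segment's sorted term dictionary through a priority queue, so every merge step re-sifts the heap (downHeap) and compares terms (BytesRef comparisons). Below is a minimal, hypothetical sketch of that merge shape, not the actual OrdinalMap code; all class and method names are made up for illustration:

{code:java}
import java.util.List;

import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.PriorityQueue;

// Hypothetical sketch of the k-way merge that global-ordinal construction
// performs; names and structure are invented for clarity.
public class GlobalOrdMergeSketch {

  // Cursor over one segment's sorted, deduplicated terms.
  private static final class SegmentCursor {
    final int segment;          // index of the segment this cursor reads
    final List<BytesRef> terms; // the segment's terms, sorted and unique
    int ord = 0;                // current segment-local ordinal

    SegmentCursor(int segment, List<BytesRef> terms) {
      this.segment = segment;
      this.terms = terms;
    }

    BytesRef current() {
      return terms.get(ord);
    }
  }

  /** Returns, per segment, a mapping from segment ordinal to global ordinal. */
  static long[][] buildSegmentToGlobalOrds(List<List<BytesRef>> segments) {
    long[][] segmentToGlobal = new long[segments.size()][];
    PriorityQueue<SegmentCursor> queue =
        new PriorityQueue<SegmentCursor>(segments.size()) {
          @Override
          protected boolean lessThan(SegmentCursor a, SegmentCursor b) {
            // Every heap sift runs this term comparison, which is why
            // downHeap and BytesRef comparisons dominate the profile.
            return a.current().compareTo(b.current()) < 0;
          }
        };
    for (int i = 0; i < segments.size(); ++i) {
      segmentToGlobal[i] = new long[segments.get(i).size()];
      if (segments.get(i).isEmpty() == false) {
        queue.add(new SegmentCursor(i, segments.get(i)));
      }
    }
    long globalOrd = -1;
    BytesRef previous = null;
    while (queue.size() > 0) {
      SegmentCursor top = queue.top();
      // Allocate a new global ordinal only when the smallest term changes.
      if (previous == null || previous.compareTo(top.current()) != 0) {
        previous = BytesRef.deepCopyOf(top.current());
        ++globalOrd;
      }
      segmentToGlobal[top.segment][top.ord] = globalOrd;
      if (++top.ord >= top.terms.size()) {
        queue.pop();       // cursor exhausted: remove it (one downHeap)
      } else {
        queue.updateTop(); // cursor advanced: re-sift it (one downHeap)
      }
    }
    return segmentToGlobal;
  }
}
{code}

In this shape, the merge performs one heap sift per segment term, each costing O(log(numSegments)) BytesRef comparisons, which lines up with the hotspots in the profile above.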