Adrien Grand created LUCENE-10560:
-------------------------------------

             Summary: Can we speed up OrdinalMap construction?
                 Key: LUCENE-10560
                 URL: https://issues.apache.org/jira/browse/LUCENE-10560
             Project: Lucene - Core
          Issue Type: Improvement
            Reporter: Adrien Grand
         Attachments: profile.png

I've seen a few faceting queries on high-cardinality fields where the 
bottleneck was the construction of the OrdinalMap.

As an example I wrote this basic (and maybe flawed?) benchmark that indexes 
100M docs / 10M unique values that are 16 bytes each and simulates the 
retrieval of the top-10 most frequent values on a MatchAllDocsQuery:
{code:java}
import java.io.IOException;
import java.nio.file.Paths;
import java.util.Arrays;
import java.util.HashMap;
import java.util.Map;
import java.util.SplittableRandom;

import org.apache.lucene.document.Document;
import org.apache.lucene.document.SortedDocValuesField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.IndexWriterConfig.OpenMode;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.OrdinalMap;
import org.apache.lucene.index.SortedDocValues;
import org.apache.lucene.index.TieredMergePolicy;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.IntsRef;
import org.apache.lucene.util.LongValues;
import org.apache.lucene.util.PriorityQueue;

public class GlobalOrds {

  public static void main(String[] args) throws IOException {
    final int numDocs = 100_000_000;
    final int numValues = 10_000_000;
    final int topN = 10;
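    // Generate numValues random 16-byte terms to draw field values from.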
    byte[][] values = new byte[numValues][];
    byte[] bytes = new byte[16];
    SplittableRandom random = new SplittableRandom(0);
    for (int i = 0; i < values.length; ++i) {
      random.nextBytes(bytes);
      values[i] = bytes.clone();
    }
    Directory dir = FSDirectory.open(Paths.get("/tmp/a"));
    TieredMergePolicy tmp = new TieredMergePolicy();
    // 0.2MB min segment size, trying to simulate the case when documents are ~10x larger than this single field
    tmp.setFloorSegmentMB(0.2);
    IndexWriter w = new IndexWriter(dir,
        new IndexWriterConfig().setMergePolicy(tmp).setMaxBufferedDocs(numDocs / 1000).setOpenMode(OpenMode.CREATE));
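    // Index numDocs docs, reusing a single Document instance, each doc getting
    // one value drawn uniformly at random from the generated terms.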
    Document doc = new Document();
    SortedDocValuesField field = new SortedDocValuesField("field", new BytesRef());
    doc.add(field);
    for (int i = 0; i < numDocs; ++i) {
      field.setBytesValue(values[random.nextInt(values.length)]);
      w.addDocument(doc);
    }
    DirectoryReader reader = DirectoryReader.open(w);
    IndexSearcher searcher = new IndexSearcher(reader);
    for (int i = 0; i < 1000; ++i) {
      search(searcher, topN);
    }
    w.close();
    reader.close();
    dir.close();
  }

  static Object DUMMY; // prevent the JVM from optimizing away some stuff

  private static void search(IndexSearcher searcher, int topN) throws IOException {
    SortedDocValues[] valuesArray = new SortedDocValues[searcher.getIndexReader().leaves().size()];
    for (LeafReaderContext context : searcher.getIndexReader().leaves()) {
      valuesArray[context.ord] = context.reader().getSortedDocValues("field");
    }
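    // Merge the per-segment term dictionaries into a map from per-segment
    // ordinals to global ordinals: the step this issue is about.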
    long start = System.nanoTime();
    OrdinalMap globalOrds = OrdinalMap.build(null, valuesArray, 0f);
    System.out.println("Build global ords: " + (System.nanoTime() - start));
    int[] globalCounts = new int[Math.toIntExact(globalOrds.getValueCount())];
    int[] localCounts = IntsRef.EMPTY_INTS;
    start = System.nanoTime();
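    // For each segment, count occurrences per segment ordinal, then project
    // the counts onto the global ordinal space.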
    for (LeafReaderContext context : searcher.getIndexReader().leaves()) {
      SortedDocValues values = context.reader().getSortedDocValues("field");
      int valueCount = values.getValueCount();
      if (localCounts.length < valueCount) {
        localCounts = new int[ArrayUtil.oversize(valueCount, Integer.BYTES)];
      } else {
        Arrays.fill(localCounts, 0, valueCount, 0);
      }
      for (int doc = values.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = values.nextDoc()) {
        localCounts[values.ordValue()]++;
      }
      LongValues segmentToGlobal = globalOrds.getGlobalOrds(context.ord);
      for (int i = 0; i < valueCount; ++i) {
        globalCounts[Math.toIntExact(segmentToGlobal.get(i))] += localCounts[i];
      }
    }
    System.out.println("Compute global counts: " + (System.nanoTime() - start));
    start = System.nanoTime();
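    // Select the top-N global ordinals with a min-heap of [ord, count] pairs.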
    PriorityQueue<int[]> topCounts = new PriorityQueue<int[]>(topN) {
      @Override
      protected boolean lessThan(int[] a, int[] b) {
        if (a[1] < b[1]) {
          return true;
        } else if (a[1] == b[1]) {
          return a[0] < b[0];
        } else {
          return false;
        }
      }
    };
    for (int i = 0; i < globalCounts.length; ++i) {
      if (topCounts.size() < topN) {
        topCounts.add(new int[] {i, globalCounts[i]});
      } else {
        int[] top = topCounts.top();
        if (globalCounts[i] > top[1]) {
          top[0] = i;
          top[1] = globalCounts[i];
          topCounts.updateTop();
        }
      }
    }
    System.out.println("Select top N global ords: " + (System.nanoTime() - 
start));
    start = System.nanoTime();
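    // Resolve each top global ordinal back to its term via the first segment
    // that contains it.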
    Map<BytesRef, Integer> topTerms = new HashMap<>();
    for (int[] entry : topCounts) {
      final int ord = entry[0];
      final int count = entry[1];
      final int segmentOrd = (int) globalOrds.getFirstSegmentOrd(ord);
      final int segmentNum = globalOrds.getFirstSegmentNumber(ord);
      BytesRef value = valuesArray[segmentNum].lookupOrd(segmentOrd);
      topTerms.put(BytesRef.deepCopyOf(value), count); // deep copy: BytesRef#clone is shallow
    }
    System.out.println("Retrieval of top-N values: " + (System.nanoTime() - 
start));
    DUMMY = topTerms;
  }

}
{code}
On my machine, constructing the ordinal map takes roughly 4x as long as 
computing the count for each global ordinal (9 seconds vs. 2.3 seconds). And 
unfortunately, this gets worse as the query gets more selective: e.g. for a 
query that matches 1M docs (1% of the index), almost the entire time could be 
spent constructing the ordinal map.

Can we speed it up? Are there better ways to compute top-k facets over a 
high-cardinality field?

For what it's worth, async-profiler tells me that most of the time is spent in 
PriorityQueue#downHeap and BytesRef#compare:

!profile.png!
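For context on why those two methods dominate: building the map is essentially 
a k-way merge of the sorted per-segment term dictionaries through a priority 
queue, so every term visited costs O(log(numSegments)) byte-wise comparisons. 
Here is a toy sketch of that pattern (made-up class, not the actual OrdinalMap 
code, which merges per-segment TermsEnums with Lucene's own PriorityQueue):
{code:java}
import java.util.Arrays;
import java.util.PriorityQueue;

/** Toy k-way merge over sorted term lists, mimicking the shape of OrdinalMap construction. */
public class KWayMergeSketch {

  // One cursor per "segment": a sorted term list and a position in it.
  private record Cursor(byte[][] terms, int pos) {
    byte[] current() {
      return terms[pos];
    }
  }

  public static void main(String[] args) {
    byte[][][] segments = {
      {b("apple"), b("kiwi"), b("pear")},
      {b("apple"), b("banana"), b("pear")},
      {b("banana"), b("kiwi")},
    };

    // Each poll/add pair sifts the heap: O(log k) byte[] comparisons per term,
    // which is where downHeap and the byte-wise compares show up in profiles.
    PriorityQueue<Cursor> pq =
        new PriorityQueue<>((a, c) -> Arrays.compare(a.current(), c.current()));
    for (byte[][] segment : segments) {
      pq.add(new Cursor(segment, 0));
    }

    long globalOrd = -1;
    byte[] previous = null;
    while (pq.isEmpty() == false) {
      Cursor top = pq.poll();
      byte[] term = top.current();
      if (previous == null || Arrays.equals(previous, term) == false) {
        previous = term;
        globalOrd++; // new distinct term -> next global ordinal
      }
      System.out.println(new String(term) + " -> global ord " + globalOrd);
      if (top.pos() + 1 < top.terms().length) {
        pq.add(new Cursor(top.terms(), top.pos() + 1));
      }
    }
  }

  private static byte[] b(String s) {
    return s.getBytes();
  }
}
{code}
So a speedup probably has to cut either the cost of each comparison (e.g. by 
exploiting shared prefixes between consecutive terms) or the number of heap 
operations.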


