gf2121 commented on code in PR #12800:
URL: https://github.com/apache/lucene/pull/12800#discussion_r1392571640


##########
lucene/benchmark-jmh/src/java/org/apache/lucene/benchmark/jmh/DocSorterBenchmark.java:
##########
@@ -0,0 +1,241 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.benchmark.jmh;
+
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.List;
+import java.util.concurrent.ThreadLocalRandom;
+import java.util.concurrent.TimeUnit;
+import java.util.stream.Collectors;
+import org.apache.lucene.util.ArrayUtil;
+import org.apache.lucene.util.BaseLSBRadixSorter;
+import org.apache.lucene.util.IntsRef;
+import org.apache.lucene.util.LongsRef;
+import org.apache.lucene.util.TimSorter;
+import org.apache.lucene.util.packed.PackedInts;
+import org.openjdk.jmh.annotations.*;
+
+@BenchmarkMode(Mode.Throughput)
+@OutputTimeUnit(TimeUnit.MILLISECONDS)
+@State(Scope.Benchmark)
+// first iteration is complete garbage, so make sure we really warmup
+@Warmup(iterations = 3, time = 1)
+// real iterations. not useful to spend tons of time here, better to fork more
+@Measurement(iterations = 5, time = 1)
+// engage some noise reduction
+@Fork(
+    value = 1,
+    jvmArgsAppend = {"-Xmx2g", "-Xms2g", "-XX:+AlwaysPreTouch"})
+public class DocSorterBenchmark {
+
+  private DocOffsetTimSorter timSorter;
+  private DocOffsetRadixSorter radixSorter;
+
+  @Param({"100000"})
+  int size;
+
+  @Param({"natural", "reverse", "random", "partial"})
+  String order;
+
+  @Param({"24", "31"})
+  int bit;
+
+  @Setup(Level.Invocation)
+  public void init() {
+    ThreadLocalRandom random = ThreadLocalRandom.current();
+
+    int maxDoc = 1 << (bit - 1);
+    assert PackedInts.bitsRequired(maxDoc) == bit;
+    // random byte arrays for binary methods
+    int[] doc = new int[size];
+    long[] off = new long[size];
+    for (int i = 0; i < size; ++i) {
+      doc[i] = random.nextInt(maxDoc);
+      off[i] = random.nextLong(Long.MAX_VALUE);

Review Comment:
   Thanks for suggestion @mikemccand !
   
   > When possible I much prefer benchmarks based on real data/corpora.
   
   Benchmark on real data is great. Since this PR is optimizing the docIds 
sorting in `SortingTermsEnum`, 
   I guess the factor that determines the doc id distribution is not the data 
itself, but the order in which the sort field is written. I tried the great 
benchmark @jpountz posted in 
https://github.com/apache/lucene/pull/12114#issuecomment-1406248901. The result 
shows ~17% speed up for flushing docs indexed in random order, and ~2% 
regression for asc/desc.
   
   <details><summary>Benchmark Detail</summary>
   
   **Baseline**
   ```
   Using order: RANDOM
   DWPT 0 [2023-11-14T12:52:36.633402Z; main]: flush time 1214.070083 ms
   DWPT 0 [2023-11-14T12:52:38.534253Z; main]: flush time 899.796208 ms
   DWPT 0 [2023-11-14T12:52:40.119878Z; main]: flush time 799.165958 ms
   DWPT 0 [2023-11-14T12:52:41.562375Z; main]: flush time 654.701 ms
   DWPT 0 [2023-11-14T12:52:42.998111Z; main]: flush time 650.918208 ms
   DWPT 0 [2023-11-14T12:52:44.433020Z; main]: flush time 647.062042 ms
   DWPT 0 [2023-11-14T12:52:45.871244Z; main]: flush time 649.0795 ms
   DWPT 0 [2023-11-14T12:52:47.309565Z; main]: flush time 652.586583 ms
   DWPT 0 [2023-11-14T12:52:48.747658Z; main]: flush time 646.734333 ms
   DWPT 0 [2023-11-14T12:52:50.196127Z; main]: flush time 654.810541 ms
   Using order: ASC
   DWPT 1 [2023-11-14T12:52:51.564601Z; main]: flush time 525.553291 ms
   DWPT 1 [2023-11-14T12:52:52.708573Z; main]: flush time 349.199958 ms
   DWPT 1 [2023-11-14T12:52:53.803105Z; main]: flush time 303.043541 ms
   DWPT 1 [2023-11-14T12:52:54.896132Z; main]: flush time 299.856667 ms
   DWPT 1 [2023-11-14T12:52:55.995053Z; main]: flush time 300.801292 ms
   DWPT 1 [2023-11-14T12:52:57.082434Z; main]: flush time 295.964542 ms
   DWPT 1 [2023-11-14T12:52:58.170703Z; main]: flush time 299.771375 ms
   DWPT 1 [2023-11-14T12:52:59.258876Z; main]: flush time 296.914333 ms
   DWPT 1 [2023-11-14T12:53:00.373431Z; main]: flush time 303.516583 ms
   DWPT 1 [2023-11-14T12:53:01.483840Z; main]: flush time 300.98975 ms
   Using order: DESC
   DWPT 2 [2023-11-14T12:53:02.748401Z; main]: flush time 422.169666 ms
   DWPT 2 [2023-11-14T12:53:03.935493Z; main]: flush time 387.440917 ms
   DWPT 2 [2023-11-14T12:53:05.112949Z; main]: flush time 384.38025 ms
   DWPT 2 [2023-11-14T12:53:06.291248Z; main]: flush time 387.05 ms
   DWPT 2 [2023-11-14T12:53:07.480404Z; main]: flush time 391.184375 ms
   DWPT 2 [2023-11-14T12:53:08.691136Z; main]: flush time 383.774625 ms
   DWPT 2 [2023-11-14T12:53:09.875488Z; main]: flush time 391.470083 ms
   DWPT 2 [2023-11-14T12:53:11.058662Z; main]: flush time 386.547958 ms
   DWPT 2 [2023-11-14T12:53:12.245281Z; main]: flush time 384.264833 ms
   DWPT 2 [2023-11-14T12:53:13.423061Z; main]: flush time 383.677958 ms
   ```
   
   **Candidate**
   ```
   Using order: RANDOM
   DWPT 0 [2023-11-14T12:51:01.872202Z; main]: flush time 1087.182375 ms
   DWPT 0 [2023-11-14T12:51:03.804886Z; main]: flush time 879.66 ms
   DWPT 0 [2023-11-14T12:51:05.190083Z; main]: flush time 603.431084 ms
   DWPT 0 [2023-11-14T12:51:06.508227Z; main]: flush time 536.702625 ms
   DWPT 0 [2023-11-14T12:51:07.826303Z; main]: flush time 537.334458 ms
   DWPT 0 [2023-11-14T12:51:09.145597Z; main]: flush time 534.974792 ms
   DWPT 0 [2023-11-14T12:51:10.479141Z; main]: flush time 540.493958 ms
   DWPT 0 [2023-11-14T12:51:11.807347Z; main]: flush time 540.360834 ms
   DWPT 0 [2023-11-14T12:51:13.135150Z; main]: flush time 532.705959 ms
   DWPT 0 [2023-11-14T12:51:14.452301Z; main]: flush time 533.977417 ms
   Using order: ASC
   DWPT 1 [2023-11-14T12:51:15.752945Z; main]: flush time 476.6145 ms
   DWPT 1 [2023-11-14T12:51:16.841999Z; main]: flush time 310.893584 ms
   DWPT 1 [2023-11-14T12:51:17.941010Z; main]: flush time 320.484458 ms
   DWPT 1 [2023-11-14T12:51:19.032024Z; main]: flush time 306.014333 ms
   DWPT 1 [2023-11-14T12:51:20.121276Z; main]: flush time 306.314708 ms
   DWPT 1 [2023-11-14T12:51:21.203727Z; main]: flush time 301.988375 ms
   DWPT 1 [2023-11-14T12:51:22.295764Z; main]: flush time 304.992917 ms
   DWPT 1 [2023-11-14T12:51:23.383463Z; main]: flush time 304.863875 ms
   DWPT 1 [2023-11-14T12:51:24.466930Z; main]: flush time 305.871583 ms
   DWPT 1 [2023-11-14T12:51:25.554916Z; main]: flush time 308.523833 ms
   Using order: DESC
   DWPT 2 [2023-11-14T12:51:26.776390Z; main]: flush time 400.262708 ms
   DWPT 2 [2023-11-14T12:51:27.954697Z; main]: flush time 399.378459 ms
   DWPT 2 [2023-11-14T12:51:29.135740Z; main]: flush time 399.318917 ms
   DWPT 2 [2023-11-14T12:51:30.313135Z; main]: flush time 393.676084 ms
   DWPT 2 [2023-11-14T12:51:31.516547Z; main]: flush time 407.419875 ms
   DWPT 2 [2023-11-14T12:51:32.692559Z; main]: flush time 392.623833 ms
   DWPT 2 [2023-11-14T12:51:33.880394Z; main]: flush time 399.269125 ms
   DWPT 2 [2023-11-14T12:51:35.051656Z; main]: flush time 393.07625 ms
   DWPT 2 [2023-11-14T12:51:36.223096Z; main]: flush time 391.553584 ms
   DWPT 2 [2023-11-14T12:51:37.404842Z; main]: flush time 392.61575 ms
   ```
   
   </details>
   
   <details><summary>Code</summary>
   
   ```
     enum Order {
       RANDOM,
       ASC,
       DESC;
     }
   
     public static void main(String[] args) throws IOException {
       for (Order order : Order.values()) {
         System.out.println("Using order: " + order.name());
         Directory dir = FSDirectory.open(Paths.get("/tmp/a"));
         IndexWriterConfig cfg = new IndexWriterConfig(new StandardAnalyzer());
         cfg.setOpenMode(IndexWriterConfig.OpenMode.CREATE);
         cfg.setInfoStream(new PrintStreamInfoStream(System.out));
         cfg.setMaxBufferedDocs(1_000_000);
         cfg.setRAMBufferSizeMB(IndexWriterConfig.DISABLE_AUTO_FLUSH);
         cfg.setIndexSort(
             new Sort(LongField.newSortField("sort_field", false, 
SortedNumericSelector.Type.MIN)));
         IndexWriter w = new IndexWriter(dir, cfg);
         Document doc = new Document();
         LongField sortField = new LongField("sort_field", 0);
         doc.add(sortField);
         TextField stringField1 = new TextField("string_field", "", 
Field.Store.NO);
         doc.add(stringField1);
         TextField stringField2 = new TextField("string_field", "", 
Field.Store.NO);
         doc.add(stringField2);
         TextField stringField3 = new TextField("string_field", "", 
Field.Store.NO);
         doc.add(stringField3);
         for (int i = 0; i < 10_000_000; ++i) {
           long sortValue =
               switch (order) {
                 case RANDOM -> i % 15;
                 case ASC -> i;
                 case DESC -> -i;
               };
           sortField.setLongValue(sortValue);
           stringField1.setStringValue(Integer.toBinaryString(i % 10));
           stringField2.setStringValue(Integer.toBinaryString(i % 100));
           stringField3.setStringValue(Integer.toBinaryString(i % 1000));
           w.addDocument(doc);
         }
         w.flush();
         w.commit();
         w.close();
       }
     }
   ```
   
   </details>



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


---------------------------------------------------------------------
To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org
For additional commands, e-mail: issues-h...@lucene.apache.org

Reply via email to