Re: [PR] Add levels to DocValues skipper index [lucene]

via GitHub Wed, 17 Jul 2024 06:01:17 -0700


jpountz commented on code in PR #13563:
URL: https://github.com/apache/lucene/pull/13563#discussion_r1680897686



##########
lucene/core/src/java/org/apache/lucene/codecs/lucene90/Lucene90DocValuesConsumer.java:
##########
@@ -207,65 +210,133 @@ void accumulate(long value) {
       maxValue = Math.max(maxValue, value);
     }
 
+    void accumulate(SkipAccumulator other) {
+      minDocID = Math.min(minDocID, other.minDocID);
+      maxDocID = Math.max(maxDocID, other.maxDocID);
+      minValue = Math.min(minValue, other.minValue);
+      maxValue = Math.max(maxValue, other.maxValue);
+      docCount += other.docCount;
+    }
+
     void nextDoc(int docID) {
       maxDocID = docID;
       ++docCount;
     }
 
-    void writeTo(DataOutput output) throws IOException {
-      output.writeInt(maxDocID);
-      output.writeInt(minDocID);
-      output.writeLong(maxValue);
-      output.writeLong(minValue);
-      output.writeInt(docCount);
+    public static SkipAccumulator merge(List<SkipAccumulator> list, int index, 
int length) {
+      SkipAccumulator acc = new SkipAccumulator(list.get(index).minDocID);
+      for (int i = 0; i < length; i++) {
+        acc.accumulate(list.get(index + i));
+      }
+      return acc;
     }
   }
 
   private void writeSkipIndex(FieldInfo field, DocValuesProducer 
valuesProducer)
       throws IOException {
     assert field.hasDocValuesSkipIndex();
-    // TODO: This disk compression once we introduce levels
-    long start = data.getFilePointer();
-    SortedNumericDocValues values = valuesProducer.getSortedNumeric(field);
+    final long start = data.getFilePointer();
+    final SortedNumericDocValues values = 
valuesProducer.getSortedNumeric(field);
     long globalMaxValue = Long.MIN_VALUE;
     long globalMinValue = Long.MAX_VALUE;
     int globalDocCount = 0;
     int maxDocId = -1;
+    final List<List<SkipAccumulator>> accumulators = new 
ArrayList<>(SKIP_INDEX_MAX_LEVEL);
+    for (int i = 0; i < SKIP_INDEX_MAX_LEVEL; i++) {
+      accumulators.add(new ArrayList<>());
+    }
     SkipAccumulator accumulator = null;
-    int counter = 0;
+    final int maxAccumulators = 1 << (SKIP_INDEX_LEVEL_SHIFT * 
(SKIP_INDEX_MAX_LEVEL - 1));
     for (int doc = values.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc 
= values.nextDoc()) {
-      if (counter == 0) {
+      if (accumulator == null) {
         accumulator = new SkipAccumulator(doc);
+        accumulators.get(0).add(accumulator);
       }
       accumulator.nextDoc(doc);
       for (int i = 0, end = values.docValueCount(); i < end; ++i) {
         accumulator.accumulate(values.nextValue());
       }
-      if (++counter == skipIndexIntervalSize) {
+      if (accumulator.docCount == skipIndexIntervalSize) {
         globalMaxValue = Math.max(globalMaxValue, accumulator.maxValue);
         globalMinValue = Math.min(globalMinValue, accumulator.minValue);
         globalDocCount += accumulator.docCount;
         maxDocId = accumulator.maxDocID;
-        accumulator.writeTo(data);
-        counter = 0;
+        accumulator = null;
+        if (accumulators.size() == maxAccumulators) {
+          writeLevels(accumulators);
+          for (List<SkipAccumulator> accumulatorList : accumulators) {
+            accumulatorList.clear();
+          }
+        }
       }
     }
 
-    if (counter > 0) {
-      globalMaxValue = Math.max(globalMaxValue, accumulator.maxValue);
-      globalMinValue = Math.min(globalMinValue, accumulator.minValue);
-      globalDocCount += accumulator.docCount;
-      maxDocId = accumulator.maxDocID;
-      accumulator.writeTo(data);
+    if (accumulators.isEmpty() == false) {
+      if (accumulator != null) {
+        globalMaxValue = Math.max(globalMaxValue, accumulator.maxValue);
+        globalMinValue = Math.min(globalMinValue, accumulator.minValue);
+        globalDocCount += accumulator.docCount;
+        maxDocId = accumulator.maxDocID;
+      }
+      writeLevels(accumulators);
     }
     meta.writeLong(start); // record the start in meta
     meta.writeLong(data.getFilePointer() - start); // record the length
+    assert globalDocCount == 0 || globalMaxValue >= globalMinValue;
     meta.writeLong(globalMaxValue);
     meta.writeLong(globalMinValue);
+    assert globalDocCount <= maxDocId + 1;
     meta.writeInt(globalDocCount);
     meta.writeInt(maxDocId);
   }
 
+  private void writeLevels(List<List<SkipAccumulator>> accumulators) throws 
IOException {
+    for (int i = 1; i < accumulators.size(); i++) {
+      buildLevel(accumulators.get(i), accumulators.get(i - 1));
+    }
+    int totalAccumulators = accumulators.get(0).size();
+    for (int index = 0; index < totalAccumulators; index++) {
+      // compute how many levels we need to write for the current accumulator
+      final int levels = getLevels(index, totalAccumulators);
+      // build the levels
+      final SkipAccumulator[] accLevels = new SkipAccumulator[levels];
+      for (int level = 0; level < levels; level++) {
+        accLevels[level] =
+            accumulators.get(level).get(index / (1 << (SKIP_INDEX_LEVEL_SHIFT 
* level)));

Review Comment:
   Nit: I believe that this could be simplified to:
   
   ```suggestion
               accumulators.get(level).get(index >> (SKIP_INDEX_LEVEL_SHIFT * level));
   ```



##########
lucene/core/src/java/org/apache/lucene/codecs/lucene90/Lucene90DocValuesConsumer.java:
##########
@@ -207,65 +210,133 @@ void accumulate(long value) {
       maxValue = Math.max(maxValue, value);
     }
 
+    void accumulate(SkipAccumulator other) {
+      minDocID = Math.min(minDocID, other.minDocID);
+      maxDocID = Math.max(maxDocID, other.maxDocID);

Review Comment:
   Nit: It looks like we always accumulate in doc ID order, so we could just do 
`maxDocID = other.maxDocID` (and assert that 
`minDocID < other.minDocID && maxDocID < other.maxDocID`)?



##########
lucene/core/src/java/org/apache/lucene/codecs/lucene90/Lucene90DocValuesProducer.java:
##########
@@ -1792,61 +1794,91 @@ public DocValuesSkipper getSkipper(FieldInfo field) 
throws IOException {
     if (input.length() > 0) {
       input.prefetch(0, 1);
     }
+    // TODO: should we write to disk the actual max level for this segment?
     return new DocValuesSkipper() {
-      int minDocID = -1;
-      int maxDocID = -1;
-      long minValue, maxValue;
-      int docCount;
+      final int[] minDocID = new int[SKIP_INDEX_MAX_LEVEL];
+      final int[] maxDocID = new int[SKIP_INDEX_MAX_LEVEL];
+
+      {
+        for (int i = 0; i < SKIP_INDEX_MAX_LEVEL; i++) {
+          minDocID[i] = maxDocID[i] = -1;
+        }
+      }
+
+      final long[] minValue = new long[SKIP_INDEX_MAX_LEVEL];
+      final long[] maxValue = new long[SKIP_INDEX_MAX_LEVEL];
+      final int[] docCount = new int[SKIP_INDEX_MAX_LEVEL];
+      int levels;
 
       @Override
       public void advance(int target) throws IOException {
         if (target > entry.maxDocId) {
-          minDocID = DocIdSetIterator.NO_MORE_DOCS;
-          maxDocID = DocIdSetIterator.NO_MORE_DOCS;
+          // skipper is exhausted
+          for (int i = 0; i < SKIP_INDEX_MAX_LEVEL; i++) {
+            minDocID[i] = maxDocID[i] = DocIdSetIterator.NO_MORE_DOCS;
+          }
         } else {
+          // find next interval
+          assert target > maxDocID[0] : "target must be bigger that current 
interval";
           while (true) {
-            maxDocID = input.readInt();
-            if (maxDocID >= target) {
-              minDocID = input.readInt();
-              maxValue = input.readLong();
-              minValue = input.readLong();
-              docCount = input.readInt();
+            levels = input.readByte();
+            assert levels <= SKIP_INDEX_MAX_LEVEL && levels > 0
+                : "level out of range [" + levels + "]";
+            boolean competitive = true;
+            // check if current interval is competitive or we can jump to the 
next position
+            for (int level = levels - 1; level >= 0; level--) {
+              if ((maxDocID[level] = input.readInt()) < target) {
+                input.skipBytes(SKIP_INDEX_JUMP_LENGTH_PER_LEVEL[level]); // 
the jump for the level
+                competitive = false;
+                break;
+              }
+              minDocID[level] = input.readInt();
+              maxValue[level] = input.readLong();
+              minValue[level] = input.readLong();
+              docCount[level] = input.readInt();
+            }
+            if (competitive) {
+              // adjust levels
+              while (levels < SKIP_INDEX_MAX_LEVEL) {
+                if (maxDocID[levels] == -1 || maxDocID[levels] < target) {

Review Comment:
   I believe that the second condition is a superset of the first condition, so 
we could skip the first one?
   
   ```suggestion
                   if (maxDocID[levels] < target) {
   ```



##########
lucene/core/src/java/org/apache/lucene/codecs/lucene90/Lucene90DocValuesConsumer.java:
##########
@@ -207,65 +210,133 @@ void accumulate(long value) {
       maxValue = Math.max(maxValue, value);
     }
 
+    void accumulate(SkipAccumulator other) {
+      minDocID = Math.min(minDocID, other.minDocID);
+      maxDocID = Math.max(maxDocID, other.maxDocID);
+      minValue = Math.min(minValue, other.minValue);
+      maxValue = Math.max(maxValue, other.maxValue);
+      docCount += other.docCount;
+    }
+
     void nextDoc(int docID) {
       maxDocID = docID;
       ++docCount;
     }
 
-    void writeTo(DataOutput output) throws IOException {
-      output.writeInt(maxDocID);
-      output.writeInt(minDocID);
-      output.writeLong(maxValue);
-      output.writeLong(minValue);
-      output.writeInt(docCount);
+    public static SkipAccumulator merge(List<SkipAccumulator> list, int index, 
int length) {
+      SkipAccumulator acc = new SkipAccumulator(list.get(index).minDocID);
+      for (int i = 0; i < length; i++) {
+        acc.accumulate(list.get(index + i));
+      }
+      return acc;
     }
   }
 
   private void writeSkipIndex(FieldInfo field, DocValuesProducer 
valuesProducer)
       throws IOException {
     assert field.hasDocValuesSkipIndex();
-    // TODO: This disk compression once we introduce levels
-    long start = data.getFilePointer();
-    SortedNumericDocValues values = valuesProducer.getSortedNumeric(field);
+    final long start = data.getFilePointer();
+    final SortedNumericDocValues values = 
valuesProducer.getSortedNumeric(field);
     long globalMaxValue = Long.MIN_VALUE;
     long globalMinValue = Long.MAX_VALUE;
     int globalDocCount = 0;
     int maxDocId = -1;
+    final List<List<SkipAccumulator>> accumulators = new 
ArrayList<>(SKIP_INDEX_MAX_LEVEL);
+    for (int i = 0; i < SKIP_INDEX_MAX_LEVEL; i++) {
+      accumulators.add(new ArrayList<>());
+    }
     SkipAccumulator accumulator = null;
-    int counter = 0;
+    final int maxAccumulators = 1 << (SKIP_INDEX_LEVEL_SHIFT * 
(SKIP_INDEX_MAX_LEVEL - 1));
     for (int doc = values.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc 
= values.nextDoc()) {
-      if (counter == 0) {
+      if (accumulator == null) {
         accumulator = new SkipAccumulator(doc);
+        accumulators.get(0).add(accumulator);
       }
       accumulator.nextDoc(doc);
       for (int i = 0, end = values.docValueCount(); i < end; ++i) {
         accumulator.accumulate(values.nextValue());
       }
-      if (++counter == skipIndexIntervalSize) {
+      if (accumulator.docCount == skipIndexIntervalSize) {
         globalMaxValue = Math.max(globalMaxValue, accumulator.maxValue);
         globalMinValue = Math.min(globalMinValue, accumulator.minValue);
         globalDocCount += accumulator.docCount;
         maxDocId = accumulator.maxDocID;
-        accumulator.writeTo(data);
-        counter = 0;
+        accumulator = null;
+        if (accumulators.size() == maxAccumulators) {
+          writeLevels(accumulators);
+          for (List<SkipAccumulator> accumulatorList : accumulators) {
+            accumulatorList.clear();
+          }
+        }
       }
     }
 
-    if (counter > 0) {
-      globalMaxValue = Math.max(globalMaxValue, accumulator.maxValue);
-      globalMinValue = Math.min(globalMinValue, accumulator.minValue);
-      globalDocCount += accumulator.docCount;
-      maxDocId = accumulator.maxDocID;
-      accumulator.writeTo(data);
+    if (accumulators.isEmpty() == false) {
+      if (accumulator != null) {
+        globalMaxValue = Math.max(globalMaxValue, accumulator.maxValue);
+        globalMinValue = Math.min(globalMinValue, accumulator.minValue);
+        globalDocCount += accumulator.docCount;
+        maxDocId = accumulator.maxDocID;
+      }
+      writeLevels(accumulators);
     }
     meta.writeLong(start); // record the start in meta
     meta.writeLong(data.getFilePointer() - start); // record the length
+    assert globalDocCount == 0 || globalMaxValue >= globalMinValue;
     meta.writeLong(globalMaxValue);
     meta.writeLong(globalMinValue);
+    assert globalDocCount <= maxDocId + 1;
     meta.writeInt(globalDocCount);
     meta.writeInt(maxDocId);
   }
 
+  private void writeLevels(List<List<SkipAccumulator>> accumulators) throws 
IOException {
+    for (int i = 1; i < accumulators.size(); i++) {
+      buildLevel(accumulators.get(i), accumulators.get(i - 1));
+    }
+    int totalAccumulators = accumulators.get(0).size();
+    for (int index = 0; index < totalAccumulators; index++) {
+      // compute how many levels we need to write for the current accumulator
+      final int levels = getLevels(index, totalAccumulators);
+      // build the levels
+      final SkipAccumulator[] accLevels = new SkipAccumulator[levels];

Review Comment:
   Nit: It looks like we don't actually need to copy them to an array and we 
could instead dynamically compute the index to look up in the for loop that 
writes levels?



##########
lucene/core/src/java/org/apache/lucene/codecs/lucene90/Lucene90DocValuesConsumer.java:
##########
@@ -207,65 +210,133 @@ void accumulate(long value) {
       maxValue = Math.max(maxValue, value);
     }
 
+    void accumulate(SkipAccumulator other) {
+      minDocID = Math.min(minDocID, other.minDocID);
+      maxDocID = Math.max(maxDocID, other.maxDocID);
+      minValue = Math.min(minValue, other.minValue);
+      maxValue = Math.max(maxValue, other.maxValue);
+      docCount += other.docCount;
+    }
+
     void nextDoc(int docID) {
       maxDocID = docID;
       ++docCount;
     }
 
-    void writeTo(DataOutput output) throws IOException {
-      output.writeInt(maxDocID);
-      output.writeInt(minDocID);
-      output.writeLong(maxValue);
-      output.writeLong(minValue);
-      output.writeInt(docCount);
+    public static SkipAccumulator merge(List<SkipAccumulator> list, int index, 
int length) {
+      SkipAccumulator acc = new SkipAccumulator(list.get(index).minDocID);
+      for (int i = 0; i < length; i++) {
+        acc.accumulate(list.get(index + i));
+      }
+      return acc;
     }
   }
 
   private void writeSkipIndex(FieldInfo field, DocValuesProducer 
valuesProducer)
       throws IOException {
     assert field.hasDocValuesSkipIndex();
-    // TODO: This disk compression once we introduce levels
-    long start = data.getFilePointer();
-    SortedNumericDocValues values = valuesProducer.getSortedNumeric(field);
+    final long start = data.getFilePointer();
+    final SortedNumericDocValues values = 
valuesProducer.getSortedNumeric(field);
     long globalMaxValue = Long.MIN_VALUE;
     long globalMinValue = Long.MAX_VALUE;
     int globalDocCount = 0;
     int maxDocId = -1;
+    final List<List<SkipAccumulator>> accumulators = new 
ArrayList<>(SKIP_INDEX_MAX_LEVEL);

Review Comment:
   Nit: For simplicity, maybe this could be just a list of accumulators 
(instead of a list of lists) and we'd create the list of lists in `writeLevels` 
since lists at indexes 1 or more are only used in `writeLevels`?



##########
lucene/core/src/java/org/apache/lucene/index/CheckIndex.java:
##########
@@ -3301,17 +3301,17 @@ private static void checkDocValueSkipper(FieldInfo fi, 
DocValuesSkipper skipper)
       if (skipper.maxDocID(0) == NO_MORE_DOCS) {
         break;
       }
+      if (skipper.minDocID(0) < doc) {
+        throw new CheckIndexException(
+            "skipper dv iterator for field: "
+                + fieldName
+                + " reports wrong minDocID, got "
+                + skipper.minDocID(0)
+                + " < "
+                + doc);
+      }
       int levels = skipper.numLevels();
       for (int level = 0; level < levels; level++) {
-        if (skipper.minDocID(level) < doc) {
-          throw new CheckIndexException(
-              "skipper dv iterator for field: "
-                  + fieldName
-                  + " reports wrong minDocID, got "
-                  + skipper.minDocID(level)
-                  + " < "
-                  + doc);
-        }

Review Comment:
   What was wrong with having this check on every level?



##########
lucene/core/src/java/org/apache/lucene/codecs/lucene90/Lucene90DocValuesConsumer.java:
##########
@@ -207,65 +210,133 @@ void accumulate(long value) {
       maxValue = Math.max(maxValue, value);
     }
 
+    void accumulate(SkipAccumulator other) {
+      minDocID = Math.min(minDocID, other.minDocID);
+      maxDocID = Math.max(maxDocID, other.maxDocID);
+      minValue = Math.min(minValue, other.minValue);
+      maxValue = Math.max(maxValue, other.maxValue);
+      docCount += other.docCount;
+    }
+
     void nextDoc(int docID) {
       maxDocID = docID;
       ++docCount;
     }
 
-    void writeTo(DataOutput output) throws IOException {
-      output.writeInt(maxDocID);
-      output.writeInt(minDocID);
-      output.writeLong(maxValue);
-      output.writeLong(minValue);
-      output.writeInt(docCount);
+    public static SkipAccumulator merge(List<SkipAccumulator> list, int index, 
int length) {
+      SkipAccumulator acc = new SkipAccumulator(list.get(index).minDocID);
+      for (int i = 0; i < length; i++) {
+        acc.accumulate(list.get(index + i));
+      }
+      return acc;
     }
   }
 
   private void writeSkipIndex(FieldInfo field, DocValuesProducer 
valuesProducer)
       throws IOException {
     assert field.hasDocValuesSkipIndex();
-    // TODO: This disk compression once we introduce levels
-    long start = data.getFilePointer();
-    SortedNumericDocValues values = valuesProducer.getSortedNumeric(field);
+    final long start = data.getFilePointer();
+    final SortedNumericDocValues values = 
valuesProducer.getSortedNumeric(field);
     long globalMaxValue = Long.MIN_VALUE;
     long globalMinValue = Long.MAX_VALUE;
     int globalDocCount = 0;
     int maxDocId = -1;
+    final List<List<SkipAccumulator>> accumulators = new 
ArrayList<>(SKIP_INDEX_MAX_LEVEL);
+    for (int i = 0; i < SKIP_INDEX_MAX_LEVEL; i++) {
+      accumulators.add(new ArrayList<>());
+    }
     SkipAccumulator accumulator = null;
-    int counter = 0;
+    final int maxAccumulators = 1 << (SKIP_INDEX_LEVEL_SHIFT * 
(SKIP_INDEX_MAX_LEVEL - 1));
     for (int doc = values.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc 
= values.nextDoc()) {
-      if (counter == 0) {
+      if (accumulator == null) {
         accumulator = new SkipAccumulator(doc);
+        accumulators.get(0).add(accumulator);
       }
       accumulator.nextDoc(doc);
       for (int i = 0, end = values.docValueCount(); i < end; ++i) {
         accumulator.accumulate(values.nextValue());
       }
-      if (++counter == skipIndexIntervalSize) {
+      if (accumulator.docCount == skipIndexIntervalSize) {
         globalMaxValue = Math.max(globalMaxValue, accumulator.maxValue);
         globalMinValue = Math.min(globalMinValue, accumulator.minValue);
         globalDocCount += accumulator.docCount;
         maxDocId = accumulator.maxDocID;
-        accumulator.writeTo(data);
-        counter = 0;
+        accumulator = null;
+        if (accumulators.size() == maxAccumulators) {

Review Comment:
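   Should it actually be `accumulators.get(0).size() == maxAccumulators`? 
`accumulators` always holds `SKIP_INDEX_MAX_LEVEL` lists, so 
`accumulators.size()` never changes as intervals are added.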



##########
lucene/test-framework/src/java/org/apache/lucene/tests/index/AssertingLeafReader.java:
##########
@@ -1194,24 +1194,27 @@ public int numLevels() {
     @Override
     public int minDocID(int level) {
       assertThread("Doc values skipper", creationThread);
-      Objects.checkIndex(level, numLevels());
       int minDocID = in.minDocID(level);
       assert minDocID <= in.maxDocID(level);
-      if (level > 0) {
-        assert minDocID <= in.minDocID(level - 1);
+      if (minDocID != -1 && minDocID != DocIdSetIterator.NO_MORE_DOCS) {
+        Objects.checkIndex(level, numLevels());
+      }
+      for (int i = 0; i < level; i++) {
+        assert minDocID >= in.minDocID(i);

Review Comment:
   The check made a bit more sense to me before. Is the problem that we return 
a `numLevels()` of 0 when the iterator is unpositioned or exhausted?



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


---------------------------------------------------------------------
To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org
For additional commands, e-mail: issues-h...@lucene.apache.org

Re: [PR] Add levels to DocValues skipper index [lucene]

Reply via email to