jpountz commented on code in PR #13563:
URL: https://github.com/apache/lucene/pull/13563#discussion_r1682502283
########## lucene/core/src/java/org/apache/lucene/codecs/lucene90/Lucene90DocValuesConsumer.java: ##########
```diff
@@ -207,65 +210,127 @@ void accumulate(long value) {
       maxValue = Math.max(maxValue, value);
     }

+    void accumulate(SkipAccumulator other) {
+      assert minDocID <= other.minDocID && maxDocID < other.maxDocID;
+      maxDocID = other.maxDocID;
+      minValue = Math.min(minValue, other.minValue);
+      maxValue = Math.max(maxValue, other.maxValue);
+      docCount += other.docCount;
+    }
+
     void nextDoc(int docID) {
       maxDocID = docID;
       ++docCount;
     }

-    void writeTo(DataOutput output) throws IOException {
-      output.writeInt(maxDocID);
-      output.writeInt(minDocID);
-      output.writeLong(maxValue);
-      output.writeLong(minValue);
-      output.writeInt(docCount);
+    public static SkipAccumulator merge(List<SkipAccumulator> list, int index, int length) {
+      SkipAccumulator acc = new SkipAccumulator(list.get(index).minDocID);
+      for (int i = 0; i < length; i++) {
+        acc.accumulate(list.get(index + i));
+      }
+      return acc;
     }
   }

   private void writeSkipIndex(FieldInfo field, DocValuesProducer valuesProducer)
       throws IOException {
     assert field.hasDocValuesSkipIndex();
-    // TODO: This disk compression once we introduce levels
-    long start = data.getFilePointer();
-    SortedNumericDocValues values = valuesProducer.getSortedNumeric(field);
+    final long start = data.getFilePointer();
+    final SortedNumericDocValues values = valuesProducer.getSortedNumeric(field);
     long globalMaxValue = Long.MIN_VALUE;
     long globalMinValue = Long.MAX_VALUE;
     int globalDocCount = 0;
     int maxDocId = -1;
+    final List<SkipAccumulator> accumulators = new ArrayList<>();
     SkipAccumulator accumulator = null;
-    int counter = 0;
+    final int maxAccumulators = 1 << (SKIP_INDEX_LEVEL_SHIFT * (SKIP_INDEX_MAX_LEVEL - 1));
     for (int doc = values.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = values.nextDoc()) {
-      if (counter == 0) {
+      if (accumulator == null) {
         accumulator = new SkipAccumulator(doc);
+        accumulators.add(accumulator);
       }
       accumulator.nextDoc(doc);
       for (int i = 0, end = values.docValueCount(); i < end; ++i) {
         accumulator.accumulate(values.nextValue());
       }
-      if (++counter == skipIndexIntervalSize) {
+      if (accumulator.docCount == skipIndexIntervalSize) {
         globalMaxValue = Math.max(globalMaxValue, accumulator.maxValue);
         globalMinValue = Math.min(globalMinValue, accumulator.minValue);
         globalDocCount += accumulator.docCount;
         maxDocId = accumulator.maxDocID;
-        accumulator.writeTo(data);
-        counter = 0;
+        accumulator = null;
+        if (accumulators.size() == maxAccumulators) {
+          writeLevels(accumulators);
+          accumulators.clear();
+        }
       }
     }

-    if (counter > 0) {
-      globalMaxValue = Math.max(globalMaxValue, accumulator.maxValue);
-      globalMinValue = Math.min(globalMinValue, accumulator.minValue);
-      globalDocCount += accumulator.docCount;
-      maxDocId = accumulator.maxDocID;
-      accumulator.writeTo(data);
+    if (accumulators.isEmpty() == false) {
+      if (accumulator != null) {
+        globalMaxValue = Math.max(globalMaxValue, accumulator.maxValue);
+        globalMinValue = Math.min(globalMinValue, accumulator.minValue);
+        globalDocCount += accumulator.docCount;
+        maxDocId = accumulator.maxDocID;
+      }
+      writeLevels(accumulators);
     }

     meta.writeLong(start); // record the start in meta
     meta.writeLong(data.getFilePointer() - start); // record the length
+    assert globalDocCount == 0 || globalMaxValue >= globalMinValue;
     meta.writeLong(globalMaxValue);
     meta.writeLong(globalMinValue);
+    assert globalDocCount <= maxDocId + 1;
     meta.writeInt(globalDocCount);
     meta.writeInt(maxDocId);
   }

+  private void writeLevels(List<SkipAccumulator> accumulators) throws IOException {
+    final List<List<SkipAccumulator>> accumulatorsLevels = new ArrayList<>(SKIP_INDEX_MAX_LEVEL);
+    accumulatorsLevels.add(accumulators);
+    for (int i = 0; i < SKIP_INDEX_MAX_LEVEL - 1; i++) {
+      accumulatorsLevels.add(buildLevel(accumulatorsLevels.get(i)));
+    }
+    int totalAccumulators = accumulators.size();
+    for (int index = 0; index < totalAccumulators; index++) {
+      // compute how many levels we need to write for the current accumulator
+      final int levels = getLevels(index, totalAccumulators);
+      // write the number of levels
+      data.writeByte((byte) levels);
+      // write intervals in reverse order. This is done so we don't
+      // need to read all of them in case of skipping
+      for (int level = levels - 1; level >= 0; level--) {
+        final SkipAccumulator accumulator =
+            accumulatorsLevels.get(level).get(index >> (SKIP_INDEX_LEVEL_SHIFT * level));
+        data.writeInt(accumulator.maxDocID);
+        data.writeInt(accumulator.minDocID);
+        data.writeLong(accumulator.maxValue);
+        data.writeLong(accumulator.minValue);
+        data.writeInt(accumulator.docCount);
+      }
+    }
+  }
+
+  private static List<SkipAccumulator> buildLevel(List<SkipAccumulator> accumulators) {
+    final int levelSize = 1 << SKIP_INDEX_LEVEL_SHIFT;
+    final List<SkipAccumulator> collector = new ArrayList<>();
+    for (int i = 0; i < accumulators.size() - levelSize + 1; i += levelSize) {
+      collector.add(SkipAccumulator.merge(accumulators, i, levelSize));
+    }
+    return collector;
+  }
+
+  private int getLevels(int index, int size) {
+    final int left = size - index;
+    for (int level = SKIP_INDEX_MAX_LEVEL - 1; level > 0; level--) {
+      final int numberIntervals = 1 << (SKIP_INDEX_LEVEL_SHIFT * level);
+      if (left >= numberIntervals && index % numberIntervals == 0) {
+        return level + 1;
+      }
+    }
+    return 1;
```
Review Comment:
There must be a way to do this in constant time rather than linearly with `SKIP_INDEX_MAX_LEVEL`?
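One way to get there in constant time, sketched here under the assumption that `SKIP_INDEX_LEVEL_SHIFT` and `SKIP_INDEX_MAX_LEVEL` keep their meaning from the diff above (an illustration, not the PR's code): the loop is asking for the highest level whose interval count both divides `index` and fits into `size - index`, which bit tricks can answer directly.

```java
// Sketch of a constant-time getLevels, equivalent to the linear loop above.
private static int getLevels(int index, int size) {
  // Highest level whose interval count (1 << (SKIP_INDEX_LEVEL_SHIFT * level))
  // divides index. Integer.numberOfTrailingZeros(0) == 32, so index == 0
  // allows every level.
  final int byAlignment = Integer.numberOfTrailingZeros(index) / SKIP_INDEX_LEVEL_SHIFT;
  // Highest level whose interval count still fits into the remaining intervals.
  final int left = size - index; // >= 1 for every index that gets written
  final int byRemaining = (31 - Integer.numberOfLeadingZeros(left)) / SKIP_INDEX_LEVEL_SHIFT;
  return Math.min(SKIP_INDEX_MAX_LEVEL - 1, Math.min(byAlignment, byRemaining)) + 1;
}
```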
########## lucene/core/src/java/org/apache/lucene/codecs/lucene90/Lucene90DocValuesProducer.java: ##########
```diff
@@ -1792,61 +1794,91 @@ public DocValuesSkipper getSkipper(FieldInfo field) throws IOException {
     if (input.length() > 0) {
       input.prefetch(0, 1);
     }
+    // TODO: should we write to disk the actual max level for this segment?
```
Review Comment:
This, or we can compute it based on maxDoc?
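If the max level were computed from `maxDoc` instead of being written, it could look roughly like this (a hypothetical sketch; it assumes each level-0 interval covers `skipIndexIntervalSize` docs, grouped by `1 << SKIP_INDEX_LEVEL_SHIFT` intervals per level, and that the interval size is available on the read side):

```java
// Hypothetical: derive an upper bound on the number of levels from maxDoc
// rather than persisting it.
int numIntervals = Math.toIntExact((maxDoc + skipIndexIntervalSize - 1L) / skipIndexIntervalSize);
int maxLevel = 1;
while (maxLevel < SKIP_INDEX_MAX_LEVEL
    && numIntervals >= (1 << (SKIP_INDEX_LEVEL_SHIFT * maxLevel))) {
  maxLevel++;
}
```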
########## lucene/core/src/java/org/apache/lucene/codecs/lucene90/Lucene90DocValuesProducer.java: ##########
```diff
@@ -1792,61 +1794,91 @@ public DocValuesSkipper getSkipper(FieldInfo field) throws IOException {
     if (input.length() > 0) {
       input.prefetch(0, 1);
     }
+    // TODO: should we write to disk the actual max level for this segment?
     return new DocValuesSkipper() {
-      int minDocID = -1;
-      int maxDocID = -1;
-      long minValue, maxValue;
-      int docCount;
+      final int[] minDocID = new int[SKIP_INDEX_MAX_LEVEL];
+      final int[] maxDocID = new int[SKIP_INDEX_MAX_LEVEL];
+
+      {
+        for (int i = 0; i < SKIP_INDEX_MAX_LEVEL; i++) {
+          minDocID[i] = maxDocID[i] = -1;
+        }
+      }
+
+      final long[] minValue = new long[SKIP_INDEX_MAX_LEVEL];
+      final long[] maxValue = new long[SKIP_INDEX_MAX_LEVEL];
+      final int[] docCount = new int[SKIP_INDEX_MAX_LEVEL];
+      int levels = 1;

       @Override
       public void advance(int target) throws IOException {
         if (target > entry.maxDocId) {
-          minDocID = DocIdSetIterator.NO_MORE_DOCS;
-          maxDocID = DocIdSetIterator.NO_MORE_DOCS;
+          // skipper is exhausted
+          for (int i = 0; i < SKIP_INDEX_MAX_LEVEL; i++) {
+            minDocID[i] = maxDocID[i] = DocIdSetIterator.NO_MORE_DOCS;
+          }
         } else {
+          // find next interval
+          assert target > maxDocID[0] : "target must be bigger than current interval";
           while (true) {
-            maxDocID = input.readInt();
-            if (maxDocID >= target) {
-              minDocID = input.readInt();
-              maxValue = input.readLong();
-              minValue = input.readLong();
-              docCount = input.readInt();
+            levels = input.readByte();
+            assert levels <= SKIP_INDEX_MAX_LEVEL && levels > 0
+                : "level out of range [" + levels + "]";
+            boolean competitive = true;
+            // check if current interval is competitive or we can jump to the next position
+            for (int level = levels - 1; level >= 0; level--) {
+              if ((maxDocID[level] = input.readInt()) < target) {
+                input.skipBytes(SKIP_INDEX_JUMP_LENGTH_PER_LEVEL[level]); // the jump for the level
+                competitive = false;
+                break;
+              }
+              minDocID[level] = input.readInt();
+              maxValue[level] = input.readLong();
+              minValue[level] = input.readLong();
+              docCount[level] = input.readInt();
+            }
+            if (competitive) {
+              // adjust levels
+              while (levels < SKIP_INDEX_MAX_LEVEL) {
+                if (maxDocID[levels] < target) {
+                  break;
+                }
+                levels++;
+              }
```
Review Comment:
Let's combine both conditions under the while condition?
```java
while (levels < SKIP_INDEX_MAX_LEVEL && maxDocID[levels] >= target) {
  levels++;
}
```
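For readers following the `skipBytes` call: each interval record written by `writeLevels` is 28 bytes (int maxDocID, int minDocID, long maxValue, long minValue, int docCount), stored from the highest level down. `SKIP_INDEX_JUMP_LENGTH_PER_LEVEL[level]` then presumably holds the bytes left in the entry once that level's `maxDocID` has been read; a reconstruction from the diffs, not necessarily the PR's exact table:

```java
// Reconstructed jump table, assuming the record layout in writeLevels:
// after reading maxDocID (4 bytes) at `level`, skip the rest of that record
// (4 + 8 + 8 + 4 = 24 bytes) plus the full 28-byte records of all lower levels.
static final int[] SKIP_INDEX_JUMP_LENGTH_PER_LEVEL = new int[SKIP_INDEX_MAX_LEVEL];

static {
  for (int level = 0; level < SKIP_INDEX_MAX_LEVEL; level++) {
    SKIP_INDEX_JUMP_LENGTH_PER_LEVEL[level] = 24 + level * 28;
  }
}
```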
########## lucene/core/src/java/org/apache/lucene/codecs/lucene90/Lucene90DocValuesConsumer.java: ##########
```diff
@@ -207,65 +210,127 @@ void accumulate(long value) {
-      if (++counter == skipIndexIntervalSize) {
+      if (accumulator.docCount == skipIndexIntervalSize) {
         globalMaxValue = Math.max(globalMaxValue, accumulator.maxValue);
         globalMinValue = Math.min(globalMinValue, accumulator.minValue);
         globalDocCount += accumulator.docCount;
         maxDocId = accumulator.maxDocID;
-        accumulator.writeTo(data);
-        counter = 0;
+        accumulator = null;
+        if (accumulators.size() == maxAccumulators) {
```
Review Comment:
Presumably this line is covered, testing-wise, by your recent change that introduced testing for small blocks?

########## lucene/core/src/java/org/apache/lucene/index/CheckIndex.java: ##########
```diff
@@ -3301,17 +3301,17 @@ private static void checkDocValueSkipper(FieldInfo fi, DocValuesSkipper skipper)
       if (skipper.maxDocID(0) == NO_MORE_DOCS) {
         break;
       }
+      if (skipper.minDocID(0) < doc) {
+        throw new CheckIndexException(
+            "skipper dv iterator for field: "
+                + fieldName
+                + " reports wrong minDocID, got "
+                + skipper.minDocID(0)
+                + " < "
+                + doc);
+      }
       int levels = skipper.numLevels();
       for (int level = 0; level < levels; level++) {
-        if (skipper.minDocID(level) < doc) {
-          throw new CheckIndexException(
-              "skipper dv iterator for field: "
-                  + fieldName
-                  + " reports wrong minDocID, got "
-                  + skipper.minDocID(level)
-                  + " < "
-                  + doc);
-        }
```
Review Comment:
This makes sense.
########## lucene/core/src/java/org/apache/lucene/codecs/lucene90/Lucene90DocValuesProducer.java: ##########
```diff
@@ -1792,61 +1794,91 @@ public DocValuesSkipper getSkipper(FieldInfo field) throws IOException {
           while (true) {
-            maxDocID = input.readInt();
-            if (maxDocID >= target) {
-              minDocID = input.readInt();
-              maxValue = input.readLong();
-              minValue = input.readLong();
-              docCount = input.readInt();
+            levels = input.readByte();
+            assert levels <= SKIP_INDEX_MAX_LEVEL && levels > 0
+                : "level out of range [" + levels + "]";
+            boolean competitive = true;
+            // check if current interval is competitive or we can jump to the next position
```
Review Comment:
I got a bit confused at first by the use of `competitive`. Maybe `valid`?