jpountz commented on code in PR #14133: URL: https://github.com/apache/lucene/pull/14133#discussion_r1915082635
##########
lucene/core/src/java/org/apache/lucene/codecs/lucene101/Lucene101PostingsWriter.java:
##########

@@ -405,7 +422,34 @@ private void flushDocBlock(boolean finishTerm) throws IOException {
         }
       }
       long numSkipBytes = level0Output.size();
-      forDeltaUtil.encodeDeltas(docDeltaBuffer, level0Output);
+      // Now we need to decide whether to encode block deltas as packed integers (FOR) or unary
+      // codes (bit set). FOR makes #nextDoc() a bit faster while the bit set approach makes
+      // #advance() sometimes faster and #intoBitSet() much faster. Since the trade-off is not
+      // obvious, we make the decision purely based on storage efficiency, using the approach that
+      // requires fewer bits to encode the block.
+      int bitsPerValue = forDeltaUtil.bitsRequired(docDeltaBuffer);
+      int sum = Math.toIntExact(Arrays.stream(docDeltaBuffer).sum());
+      int numBitSetLongs = FixedBitSet.bits2words(sum);
+      if (sum == BLOCK_SIZE) {
+        level0Output.writeByte((byte) 0);
+      } else if (version < VERSION_DENSE_BLOCKS_AS_BITSETS || bitsPerValue * BLOCK_SIZE < sum) {
+        level0Output.writeByte((byte) bitsPerValue);
+        forDeltaUtil.encodeDeltas(bitsPerValue, docDeltaBuffer, level0Output);
+      } else {
+        // Storing doc deltas is more efficient using unary coding (ie. storing doc IDs as a bit
+        // set)
+        spareBitSet.clear(0, numBitSetLongs << 6);
+        int s = -1;
+        for (int i : docDeltaBuffer) {
+          s += i;
+          spareBitSet.set(s);
+        }
+        level0Output.writeByte((byte) -numBitSetLongs);

Review Comment:
Indeed, I added a comment.

--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the URL above to go to the specific comment.
To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org --------------------------------------------------------------------- To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org For additional commands, e-mail: issues-h...@lucene.apache.org