Jackie-Jiang commented on a change in pull request #5548: URL: https://github.com/apache/incubator-pinot/pull/5548#discussion_r439701135
########## File path: pinot-core/src/main/java/org/apache/pinot/core/io/util/PinotDataBitSetV2.java ########## @@ -632,98 +654,39 @@ public void readInt(long startIndex, int length, int[] out) { // handle spill over at 2-byte boundary to unpack 1 integer if (length > 0) { - out[i] = (int)_dataBuffer.getShort(byteOffset) & 0xffff; + out[i] = (int)_buffer.getShort(byteOffset) & 0xffff; } } } public static class RawInt extends PinotDataBitSetV2 { + // grab a final local reference to avoid + // the potential performance penalty that comes with super -> super + // as jvm sometimes doesn't inline the references across inheritance + private final PinotDataBuffer _buffer; RawInt(PinotDataBuffer dataBuffer, int numBits) { - _dataBuffer = dataBuffer; - _numBitsPerValue = numBits; + super(dataBuffer, numBits); + _buffer = dataBuffer; } @Override public int readInt(long index) { - return _dataBuffer.getInt(index * Integer.BYTES); + return _buffer.getInt(index * Integer.BYTES); } @Override public void readInt(long startIndex, int length, int[] out) { long byteOffset = startIndex * Integer.BYTES; for (int i = 0; i < length; i++) { - out[i] = _dataBuffer.getInt(byteOffset); + out[i] = _buffer.getInt(byteOffset); byteOffset += 4; } } } - protected void writeInt(int index, int value) { - long bitOffset = (long) index * _numBitsPerValue; - int byteOffset = (int) (bitOffset / Byte.SIZE); - int bitOffsetInFirstByte = (int) (bitOffset % Byte.SIZE); - - int firstByte = _dataBuffer.getByte(byteOffset); - - int firstByteMask = BYTE_MASK >>> bitOffsetInFirstByte; - int numBitsLeft = _numBitsPerValue - (Byte.SIZE - bitOffsetInFirstByte); - if (numBitsLeft <= 0) { - // The value is inside the first byte - firstByteMask &= BYTE_MASK << -numBitsLeft; - _dataBuffer.putByte(byteOffset, (byte) ((firstByte & ~firstByteMask) | (value << -numBitsLeft))); - } else { - // The value is in multiple bytes - _dataBuffer - .putByte(byteOffset, (byte) ((firstByte & ~firstByteMask) | ((value >>> numBitsLeft) & firstByteMask))); - while (numBitsLeft > Byte.SIZE) { - numBitsLeft -= Byte.SIZE; - _dataBuffer.putByte(++byteOffset, (byte) (value >> numBitsLeft)); - } - int lastByte = _dataBuffer.getByte(++byteOffset); - _dataBuffer.putByte(byteOffset, - (byte) ((lastByte & (BYTE_MASK >>> numBitsLeft)) | (value << (Byte.SIZE - numBitsLeft)))); - } - } - - public void writeInt(int startIndex, int length, int[] values) { - long startBitOffset = (long) startIndex * _numBitsPerValue; - int byteOffset = (int) (startBitOffset / Byte.SIZE); - int bitOffsetInFirstByte = (int) (startBitOffset % Byte.SIZE); - - int firstByte = _dataBuffer.getByte(byteOffset); - - for (int i = 0; i < length; i++) { - int value = values[i]; - if (bitOffsetInFirstByte == Byte.SIZE) { - bitOffsetInFirstByte = 0; - firstByte = _dataBuffer.getByte(++byteOffset); - } - int firstByteMask = BYTE_MASK >>> bitOffsetInFirstByte; - int numBitsLeft = _numBitsPerValue - (Byte.SIZE - bitOffsetInFirstByte); - if (numBitsLeft <= 0) { - // The value is inside the first byte - firstByteMask &= BYTE_MASK << -numBitsLeft; - firstByte = ((firstByte & ~firstByteMask) | (value << -numBitsLeft)); - _dataBuffer.putByte(byteOffset, (byte) firstByte); - bitOffsetInFirstByte = Byte.SIZE + numBitsLeft; - } else { - // The value is in multiple bytes - _dataBuffer - .putByte(byteOffset, (byte) ((firstByte & ~firstByteMask) | ((value >>> numBitsLeft) & firstByteMask))); - while (numBitsLeft > Byte.SIZE) { - numBitsLeft -= Byte.SIZE; - _dataBuffer.putByte(++byteOffset, (byte) (value >> numBitsLeft)); - } - int lastByte = _dataBuffer.getByte(++byteOffset); - firstByte = (lastByte & (0xFF >>> numBitsLeft)) | (value << (Byte.SIZE - numBitsLeft)); - _dataBuffer.putByte(byteOffset, (byte) firstByte); - bitOffsetInFirstByte = numBitsLeft; - } - } - } - @Override - public void close() - throws IOException { + public void close() { + // NOTE: DO NOT close the PinotDataBuffer here because it is tracked by the caller and might be reused later. The + // caller is responsible of closing the PinotDataBuffer. } } Review comment: (nit) new line ########## File path: pinot-core/src/main/java/org/apache/pinot/core/io/util/PinotBitSet.java ########## @@ -0,0 +1,57 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.pinot.core.io.util; + +import java.io.Closeable; + + +public interface PinotBitSet extends Closeable { + + /** + * Decode integers starting at a given index. + * @param index index + * @return unpacked integer + */ + int readInt(long index); + + /** + * Decode integers for a contiguous range of indexes represented by startIndex + * and length + * @param startIndex start docId + * @param length length + * @param out out array to store the unpacked integers + */ + void readInt(long startIndex, int length, int[] out); Review comment: (nit) Rename the third argument to `buffer`? ########## File path: pinot-core/src/main/java/org/apache/pinot/core/io/util/PinotDataBitSetV2.java ########## @@ -24,109 +24,58 @@ import org.apache.pinot.core.segment.memory.PinotDataBuffer; -public abstract class PinotDataBitSetV2 implements Closeable { - private static final int BYTE_MASK = 0xFF; - static final int MAX_VALUES_UNPACKED_SINGLE_ALIGNED_READ = 16; // comes from 2-bit encoding - - private static final ThreadLocal<int[]> THREAD_LOCAL_DICT_IDS = - ThreadLocal.withInitial(() -> new int[DocIdSetPlanNode.MAX_DOC_PER_CALL]); - - protected PinotDataBuffer _dataBuffer; - protected int _numBitsPerValue; - - /** - * Decode integers starting at a given index. This is efficient - * because of simplified bitmath. - * @param index docId - * @return unpacked integer - */ - public abstract int readInt(long index); - - /** - * Decode integers for a contiguous range of indexes represented by startIndex - * and length. This uses vectorization as much as possible for all the aligned - * reads and also takes care of the small byte-sized window of unaligned read. - * @param startIndex start docId - * @param length length - * @param out out array to store the unpacked integers - */ - public abstract void readInt(long startIndex, int length, int[] out); - - /** - * Decode integers for an array of indexes which is not necessarily - * contiguous. So there could be gaps in the array: - * e.g: [1, 3, 7, 9, 11, 12] - * The actual read is done by the previous API since that is efficient - * as it exploits contiguity and uses vectorization. However, since - * the out[] array has to be correctly populated with the unpacked integer - * for each index, a post-processing step is needed after the bulk contiguous - * read to correctly set the unpacked integer into the out array throwing away - * the unnecessary values decoded as part of contiguous read - * @param docIds index array - * @param docIdsStartIndex starting index in the docIds array - * @param length length to read (number of docIds to read in the array) - * @param out out array to store the decoded integers - * @param outpos starting index in the out array - */ - public void readInt(int[] docIds, int docIdsStartIndex, int length, int[] out, int outpos) { - int startDocId = docIds[docIdsStartIndex]; - int endDocId = docIds[docIdsStartIndex + length - 1]; - int[] dictIds = THREAD_LOCAL_DICT_IDS.get(); - // do a contiguous bulk read - readInt(startDocId, endDocId - startDocId + 1, dictIds); - out[outpos] = dictIds[0]; - // set the unpacked integer correctly. this is needed since there could - // be gaps and some decoded values may have to be thrown/ignored. - for (int i = 1; i < length; i++) { - out[outpos + i] = dictIds[docIds[docIdsStartIndex + i] - startDocId]; - } - } +public abstract class PinotDataBitSetV2 extends BasePinotBitSet implements PinotBitSet, Closeable { + static final int MAX_VALUES_UNPACKED_SINGLE_ALIGNED_READ = 32; - public static PinotDataBitSetV2 createBitSet(PinotDataBuffer pinotDataBuffer, int numBitsPerValue) { - switch (numBitsPerValue) { - case 1: - return new Bit1Encoded(pinotDataBuffer, numBitsPerValue); - case 2: - return new Bit2Encoded(pinotDataBuffer, numBitsPerValue); - case 4: - return new Bit4Encoded(pinotDataBuffer, numBitsPerValue); - case 8: - return new Bit8Encoded(pinotDataBuffer, numBitsPerValue); - case 16: - return new Bit16Encoded(pinotDataBuffer, numBitsPerValue); - case 32: - return new RawInt(pinotDataBuffer, numBitsPerValue); - default: - throw new UnsupportedOperationException(numBitsPerValue + "not supported by PinotDataBitSetV2"); - } + public PinotDataBitSetV2(PinotDataBuffer dataBuffer, int numBits) { + super(dataBuffer, numBits); } public static class Bit1Encoded extends PinotDataBitSetV2 { + // grab a final local reference to avoid Review comment: Do you observe performance degradation on this? I think hotspot can definitely handle this. Keeping an extra reference seems like over-optimization to me. ########## File path: pinot-core/src/main/java/org/apache/pinot/core/io/util/PinotBitSet.java ########## @@ -0,0 +1,57 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.pinot.core.io.util; + +import java.io.Closeable; + + +public interface PinotBitSet extends Closeable { + + /** + * Decode integers starting at a given index. + * @param index index + * @return unpacked integer + */ + int readInt(long index); + + /** + * Decode integers for a contiguous range of indexes represented by startIndex + * and length + * @param startIndex start docId + * @param length length + * @param out out array to store the unpacked integers + */ + void readInt(long startIndex, int length, int[] out); + + /** + * Encode integer starting at a given index. + * @param index index + * @param value integer to encode + */ + void writeInt(int index, int value); + + /** + * Encode integers for a contiguous range of indexes represented by startIndex + * and length + * @param startIndex start docId + * @param length length + * @param buffer array of integers to encode + */ + void writeInt(int startIndex, int length, int[] buffer); Review comment: (nit) Rename the third argument to `values` as it is used as the input? ########## File path: pinot-core/src/main/java/org/apache/pinot/core/io/reader/impl/v1/FixedBitSingleValueReader.java ########## @@ -20,15 +20,15 @@ import org.apache.pinot.core.io.reader.BaseSingleColumnSingleValueReader; import org.apache.pinot.core.io.reader.ReaderContext; -import org.apache.pinot.core.io.util.FixedBitIntReaderWriter; +import org.apache.pinot.core.io.util.FixedBitIntReaderWriterV2; import org.apache.pinot.core.segment.memory.PinotDataBuffer; public final class FixedBitSingleValueReader extends BaseSingleColumnSingleValueReader { - private final FixedBitIntReaderWriter _reader; + private final FixedBitIntReaderWriterV2 _reader; Review comment: Directly use `PinotBitSet`, and move the logic of bulk read in `FixedBitIntReaderWriterV2` to this class. ########## File path: pinot-core/src/main/java/org/apache/pinot/core/io/util/BasePinotBitSet.java ########## @@ -0,0 +1,164 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.pinot.core.io.util; + +import org.apache.pinot.core.segment.memory.PinotDataBuffer; + + +public abstract class BasePinotBitSet implements PinotBitSet { + private static final int[] NUM_BITS_SET = new int[1 << Byte.SIZE]; + private static final int[][] NTH_BIT_SET = new int[Byte.SIZE][1 << Byte.SIZE]; + private static final int[] FIRST_BIT_SET = NTH_BIT_SET[0]; + private static final int BYTE_MASK = 0xFF; + + protected PinotDataBuffer _dataBuffer; + protected int _numBitsPerValue; + + public BasePinotBitSet(PinotDataBuffer dataBuffer, int numBitsPerValue) { + _dataBuffer = dataBuffer; + _numBitsPerValue = numBitsPerValue; + } + + static { + for (int i = 0; i < (1 << Byte.SIZE); i++) { + int numBitsSet = 0; + for (int j = 0; j < Byte.SIZE; j++) { + if ((i & (0x80 >>> j)) != 0) { + NUM_BITS_SET[i]++; + NTH_BIT_SET[numBitsSet][i] = j; + numBitsSet++; + } + } + } + } + + @Override + public void writeInt(int index, int value) { + long bitOffset = (long) index * _numBitsPerValue; + int byteOffset = (int) (bitOffset / Byte.SIZE); + int bitOffsetInFirstByte = (int) (bitOffset % Byte.SIZE); + + int firstByte = _dataBuffer.getByte(byteOffset); + + int firstByteMask = BYTE_MASK >>> bitOffsetInFirstByte; + int numBitsLeft = _numBitsPerValue - (Byte.SIZE - bitOffsetInFirstByte); + if (numBitsLeft <= 0) { + // The value is inside the first byte + firstByteMask &= BYTE_MASK << -numBitsLeft; + _dataBuffer.putByte(byteOffset, (byte) ((firstByte & ~firstByteMask) | (value << -numBitsLeft))); + } else { + // The value is in multiple bytes + _dataBuffer + .putByte(byteOffset, (byte) ((firstByte & ~firstByteMask) | ((value >>> numBitsLeft) & firstByteMask))); + while (numBitsLeft > Byte.SIZE) { + numBitsLeft -= Byte.SIZE; + _dataBuffer.putByte(++byteOffset, (byte) (value >> numBitsLeft)); + } + int lastByte = _dataBuffer.getByte(++byteOffset); + _dataBuffer.putByte(byteOffset, + (byte) ((lastByte & (BYTE_MASK >>> numBitsLeft)) | (value << (Byte.SIZE - numBitsLeft)))); + } + } + + @Override + public void writeInt(int startIndex, int length, int[] values) { + long startBitOffset = (long) startIndex * _numBitsPerValue; + int byteOffset = (int) (startBitOffset / Byte.SIZE); + int bitOffsetInFirstByte = (int) (startBitOffset % Byte.SIZE); + + int firstByte = _dataBuffer.getByte(byteOffset); + + for (int i = 0; i < length; i++) { + int value = values[i]; + if (bitOffsetInFirstByte == Byte.SIZE) { + bitOffsetInFirstByte = 0; + firstByte = _dataBuffer.getByte(++byteOffset); + } + int firstByteMask = BYTE_MASK >>> bitOffsetInFirstByte; + int numBitsLeft = _numBitsPerValue - (Byte.SIZE - bitOffsetInFirstByte); + if (numBitsLeft <= 0) { + // The value is inside the first byte + firstByteMask &= BYTE_MASK << -numBitsLeft; + firstByte = ((firstByte & ~firstByteMask) | (value << -numBitsLeft)); + _dataBuffer.putByte(byteOffset, (byte) firstByte); + bitOffsetInFirstByte = Byte.SIZE + numBitsLeft; + } else { + // The value is in multiple bytes + _dataBuffer + .putByte(byteOffset, (byte) ((firstByte & ~firstByteMask) | ((value >>> numBitsLeft) & firstByteMask))); + while (numBitsLeft > Byte.SIZE) { + numBitsLeft -= Byte.SIZE; + _dataBuffer.putByte(++byteOffset, (byte) (value >> numBitsLeft)); + } + int lastByte = _dataBuffer.getByte(++byteOffset); + firstByte = (lastByte & (0xFF >>> numBitsLeft)) | (value << (Byte.SIZE - numBitsLeft)); + _dataBuffer.putByte(byteOffset, (byte) firstByte); + bitOffsetInFirstByte = numBitsLeft; + } + } + } + + // Helper functions used by multi-value reader and writer Review comment: Let's move these util methods into a separate util class. ########## File path: pinot-core/src/main/java/org/apache/pinot/core/io/reader/impl/v1/FixedBitMultiValueReader.java ########## @@ -49,8 +51,8 @@ private static final int PREFERRED_NUM_VALUES_PER_CHUNK = 2048; private final FixedByteValueReaderWriter _chunkOffsetReader; - private final PinotDataBitSet _bitmapReader; - private final FixedBitIntReaderWriter _rawDataReader; + private final PinotDataBuffer _headerBitmapBuffer; + private final FixedBitIntReaderWriterV2 _rawDataReader; Review comment: I would suggest removing `FixedBitIntReaderWriter` and `FixedBitIntReaderWriterV2` and directly use `PinotBitSet`. There is no value added to this extra layer. ########## File path: pinot-core/src/main/java/org/apache/pinot/core/io/util/PinotDataBitSetV2.java ########## @@ -632,98 +654,39 @@ public void readInt(long startIndex, int length, int[] out) { // handle spill over at 2-byte boundary to unpack 1 integer if (length > 0) { - out[i] = (int)_dataBuffer.getShort(byteOffset) & 0xffff; + out[i] = (int)_buffer.getShort(byteOffset) & 0xffff; } } } public static class RawInt extends PinotDataBitSetV2 { + // grab a final local reference to avoid + // the potential performance penalty that comes with super -> super + // as jvm sometimes doesn't inline the references across inheritance + private final PinotDataBuffer _buffer; RawInt(PinotDataBuffer dataBuffer, int numBits) { - _dataBuffer = dataBuffer; - _numBitsPerValue = numBits; + super(dataBuffer, numBits); + _buffer = dataBuffer; } @Override public int readInt(long index) { - return _dataBuffer.getInt(index * Integer.BYTES); + return _buffer.getInt(index * Integer.BYTES); } @Override public void readInt(long startIndex, int length, int[] out) { long byteOffset = startIndex * Integer.BYTES; for (int i = 0; i < length; i++) { - out[i] = _dataBuffer.getInt(byteOffset); + out[i] = _buffer.getInt(byteOffset); byteOffset += 4; } } } - protected void writeInt(int index, int value) { - long bitOffset = (long) index * _numBitsPerValue; - int byteOffset = (int) (bitOffset / Byte.SIZE); - int bitOffsetInFirstByte = (int) (bitOffset % Byte.SIZE); - - int firstByte = _dataBuffer.getByte(byteOffset); - - int firstByteMask = BYTE_MASK >>> bitOffsetInFirstByte; - int numBitsLeft = _numBitsPerValue - (Byte.SIZE - bitOffsetInFirstByte); - if (numBitsLeft <= 0) { - // The value is inside the first byte - firstByteMask &= BYTE_MASK << -numBitsLeft; - _dataBuffer.putByte(byteOffset, (byte) ((firstByte & ~firstByteMask) | (value << -numBitsLeft))); - } else { - // The value is in multiple bytes - _dataBuffer - .putByte(byteOffset, (byte) ((firstByte & ~firstByteMask) | ((value >>> numBitsLeft) & firstByteMask))); - while (numBitsLeft > Byte.SIZE) { - numBitsLeft -= Byte.SIZE; - _dataBuffer.putByte(++byteOffset, (byte) (value >> numBitsLeft)); - } - int lastByte = _dataBuffer.getByte(++byteOffset); - _dataBuffer.putByte(byteOffset, - (byte) ((lastByte & (BYTE_MASK >>> numBitsLeft)) | (value << (Byte.SIZE - numBitsLeft)))); - } - } - - public void writeInt(int startIndex, int length, int[] values) { - long startBitOffset = (long) startIndex * _numBitsPerValue; - int byteOffset = (int) (startBitOffset / Byte.SIZE); - int bitOffsetInFirstByte = (int) (startBitOffset % Byte.SIZE); - - int firstByte = _dataBuffer.getByte(byteOffset); - - for (int i = 0; i < length; i++) { - int value = values[i]; - if (bitOffsetInFirstByte == Byte.SIZE) { - bitOffsetInFirstByte = 0; - firstByte = _dataBuffer.getByte(++byteOffset); - } - int firstByteMask = BYTE_MASK >>> bitOffsetInFirstByte; - int numBitsLeft = _numBitsPerValue - (Byte.SIZE - bitOffsetInFirstByte); - if (numBitsLeft <= 0) { - // The value is inside the first byte - firstByteMask &= BYTE_MASK << -numBitsLeft; - firstByte = ((firstByte & ~firstByteMask) | (value << -numBitsLeft)); - _dataBuffer.putByte(byteOffset, (byte) firstByte); - bitOffsetInFirstByte = Byte.SIZE + numBitsLeft; - } else { - // The value is in multiple bytes - _dataBuffer - .putByte(byteOffset, (byte) ((firstByte & ~firstByteMask) | ((value >>> numBitsLeft) & firstByteMask))); - while (numBitsLeft > Byte.SIZE) { - numBitsLeft -= Byte.SIZE; - _dataBuffer.putByte(++byteOffset, (byte) (value >> numBitsLeft)); - } - int lastByte = _dataBuffer.getByte(++byteOffset); - firstByte = (lastByte & (0xFF >>> numBitsLeft)) | (value << (Byte.SIZE - numBitsLeft)); - _dataBuffer.putByte(byteOffset, (byte) firstByte); - bitOffsetInFirstByte = numBitsLeft; - } - } - } - @Override - public void close() - throws IOException { + public void close() { Review comment: Move this to `BasePinotBitSet`? ---------------------------------------------------------------- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@pinot.apache.org For additional commands, e-mail: commits-h...@pinot.apache.org