amogh-jahagirdar commented on code in PR #11463: URL: https://github.com/apache/iceberg/pull/11463#discussion_r1828450352
########## core/src/main/java/org/apache/iceberg/deletes/BitmapPositionDeleteIndex.java: ########## @@ -92,4 +107,113 @@ public Collection<DeleteFile> deleteFiles() { public long cardinality() { return bitmap.cardinality(); } + + /** + * Serializes the index using the following format: + * + * <ul> + * <li>The length of the magic bytes and bitmap stored as 4 bytes (big-endian). + * <li>A 4-byte {@link #MAGIC_NUMBER} (little-endian). + * <li>The bitmap serialized using the portable Roaring spec (little-endian). + * <li>A CRC-32 checksum of the magic bytes and bitmap as 4-bytes (big-endian). + * </ul> + * + * Note that the length and the checksum are computed for the bitmap data, which includes the + * magic bytes and bitmap for compatibility with Delta. + */ + @Override + public ByteBuffer serialize() { + bitmap.runLengthEncode(); // run-length encode the bitmap before serializing Review Comment: Should we more eagerly run length encode the bitmap in the constructor instead of at serialization time? ########## core/src/main/java/org/apache/iceberg/deletes/BitmapPositionDeleteIndex.java: ########## @@ -92,4 +107,113 @@ public Collection<DeleteFile> deleteFiles() { public long cardinality() { return bitmap.cardinality(); } + + /** + * Serializes the index using the following format: + * + * <ul> + * <li>The length of the magic bytes and bitmap stored as 4 bytes (big-endian). + * <li>A 4-byte {@link #MAGIC_NUMBER} (little-endian). + * <li>The bitmap serialized using the portable Roaring spec (little-endian). + * <li>A CRC-32 checksum of the magic bytes and bitmap as 4-bytes (big-endian). + * </ul> + * + * Note that the length and the checksum are computed for the bitmap data, which includes the + * magic bytes and bitmap for compatibility with Delta. + */ + @Override + public ByteBuffer serialize() { + bitmap.runLengthEncode(); // run-length encode the bitmap before serializing + int bitmapDataLength = computeBitmapDataLength(bitmap); // magic bytes + bitmap + byte[] bytes = new byte[LENGTH_SIZE_BYTES + bitmapDataLength + CRC_SIZE_BYTES]; + ByteBuffer buffer = ByteBuffer.wrap(bytes); + buffer.putInt(bitmapDataLength); + serializeBitmapData(bytes, bitmapDataLength, bitmap); + int crcOffset = LENGTH_SIZE_BYTES + bitmapDataLength; + int crc = computeChecksum(bytes, bitmapDataLength); + buffer.putInt(crcOffset, crc); + buffer.rewind(); + return buffer; + } + + /** + * Deserializes the index from bytes, assuming the format described in {@link #serialize()}. + * + * @param bytes an array containing the serialized index + * @param deleteFile the DV file + * @return the deserialized index + */ + public static PositionDeleteIndex deserialize(byte[] bytes, DeleteFile deleteFile) { + ByteBuffer buffer = ByteBuffer.wrap(bytes); + int bitmapDataLength = readBitmapDataLength(buffer, deleteFile); + RoaringPositionBitmap bitmap = deserializeBitmap(bytes, bitmapDataLength, deleteFile); + int crc = computeChecksum(bytes, bitmapDataLength); + int crcOffset = LENGTH_SIZE_BYTES + bitmapDataLength; + int expectedCrc = buffer.getInt(crcOffset); + Preconditions.checkArgument(crc == expectedCrc, "Invalid CRC"); + return new BitmapPositionDeleteIndex(bitmap, deleteFile); + } + + // computes and validates the length of the bitmap data (magic bytes + bitmap) + private static int computeBitmapDataLength(RoaringPositionBitmap bitmap) { + long length = MAGIC_NUMBER_SIZE_BYTES + bitmap.serializedSizeInBytes(); + long bufferSize = LENGTH_SIZE_BYTES + length + CRC_SIZE_BYTES; + Preconditions.checkState(bufferSize <= Integer.MAX_VALUE, "Can't serialize index > 2GB"); + return (int) length; + } + + // serializes the bitmap data (magic bytes + bitmap) using the little-endian byte order + private static void serializeBitmapData( + byte[] bytes, int bitmapDataLength, RoaringPositionBitmap bitmap) { + ByteBuffer bitmapData = pointToBitmapData(bytes, bitmapDataLength); + bitmapData.putInt(MAGIC_NUMBER); + bitmap.serialize(bitmapData); + } + + // points to the bitmap data in the blob + private static ByteBuffer pointToBitmapData(byte[] bytes, int bitmapDataLength) { + ByteBuffer bitmapData = ByteBuffer.wrap(bytes, BITMAP_DATA_OFFSET, bitmapDataLength); + bitmapData.order(ByteOrder.LITTLE_ENDIAN); + return bitmapData; + } + + // checks the blob size is equal to the bitmap data length + extra bytes for length and CRC + private static int readBitmapDataLength(ByteBuffer buffer, DeleteFile deleteFile) { + int length = buffer.getInt(); + long expectedLength = deleteFile.contentSizeInBytes() - LENGTH_SIZE_BYTES - CRC_SIZE_BYTES; + Preconditions.checkArgument( + length == expectedLength, + "Invalid bitmap data length: %s, expected %s", + length, + expectedLength); + return length; + } + + // validates magic bytes and deserializes the bitmap + private static RoaringPositionBitmap deserializeBitmap( + byte[] bytes, int bitmapDataLength, DeleteFile deleteFile) { + ByteBuffer bitmapData = pointToBitmapData(bytes, bitmapDataLength); + int magicNumber = bitmapData.getInt(); + Preconditions.checkArgument( + magicNumber == MAGIC_NUMBER, + "Invalid magic number: %s, expected %s", + magicNumber, + MAGIC_NUMBER); + RoaringPositionBitmap bitmap = RoaringPositionBitmap.deserialize(bitmapData); + long cardinality = bitmap.cardinality(); + long expectedCardinality = deleteFile.recordCount(); Review Comment: NIt: I feel like we could inline ` bitMap.cardinality() == deleteFile.recordCount()` in the check below and the intent of the check would still be clear ########## core/src/main/java/org/apache/iceberg/deletes/BitmapPositionDeleteIndex.java: ########## @@ -92,4 +107,113 @@ public Collection<DeleteFile> deleteFiles() { public long cardinality() { return bitmap.cardinality(); } + + /** + * Serializes the index using the following format: + * + * <ul> + * <li>The length of the magic bytes and bitmap stored as 4 bytes (big-endian). + * <li>A 4-byte {@link #MAGIC_NUMBER} (little-endian). + * <li>The bitmap serialized using the portable Roaring spec (little-endian). + * <li>A CRC-32 checksum of the magic bytes and bitmap as 4-bytes (big-endian). + * </ul> + * + * Note that the length and the checksum are computed for the bitmap data, which includes the + * magic bytes and bitmap for compatibility with Delta. + */ + @Override + public ByteBuffer serialize() { + bitmap.runLengthEncode(); // run-length encode the bitmap before serializing Review Comment: NVM, I don't think it'll make any difference and probably more readable the way it's written at the moment. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: issues-unsubscr...@iceberg.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org --------------------------------------------------------------------- To unsubscribe, e-mail: issues-unsubscr...@iceberg.apache.org For additional commands, e-mail: issues-h...@iceberg.apache.org