nastra commented on code in PR #11302:
URL: https://github.com/apache/iceberg/pull/11302#discussion_r1824171453


##########
core/src/main/java/org/apache/iceberg/deletes/BitmapPositionDeleteIndex.java:
##########
@@ -71,21 +85,128 @@ public void merge(PositionDeleteIndex that) {
 
   @Override
   public boolean isDeleted(long position) {
-    return roaring64Bitmap.contains(position);
+    return bitmap.contains(position);
   }
 
   @Override
   public boolean isEmpty() {
-    return roaring64Bitmap.isEmpty();
+    return bitmap.isEmpty();
   }
 
   @Override
   public void forEach(LongConsumer consumer) {
-    roaring64Bitmap.forEach(consumer::accept);
+    bitmap.forEach(consumer);
   }
 
   @Override
   public Collection<DeleteFile> deleteFiles() {
     return deleteFiles;
   }
+
+  @Override
+  public long cardinality() {
+    return bitmap.cardinality();
+  }
+
+  /**
+   * Serializes the index using the following format:
+   *
+   * <ul>
+   *   <li>The length of the bitmap and magic bytes stored as 4 bytes (big-endian).
+   *   <li>A 4-byte magic sequence (0xD1D33964) (little-endian).
+   *   <li>The bitmap serialized using the portable Roaring spec (little-endian).
+   *   <li>A CRC-32 checksum of the bitmap and magic bytes as 4-bytes (big-endian).
+   * </ul>
+   *
+   * Note that the length and the checksum are computed for the bitmap data, which includes the
+   * magic bytes and the serialized bitmap for compatibility with Delta Lake.
+   */
+  @Override
+  public ByteBuffer serialize() {
+    bitmap.runLengthEncode(); // run-length encode the bitmap before serializing
+    int bitmapDataLength = computeBitmapDataLength(bitmap); // magic bytes + bitmap
+    byte[] bytes = new byte[LENGTH_SIZE_BYTES + bitmapDataLength + CRC_SIZE_BYTES];
+    ByteBuffer buffer = ByteBuffer.wrap(bytes);
+    buffer.putInt(bitmapDataLength);
+    serializeBitmapData(bytes, bitmapDataLength, bitmap);
+    int crcOffset = LENGTH_SIZE_BYTES + bitmapDataLength;
+    int crc = computeChecksum(bytes, bitmapDataLength);
+    buffer.putInt(crcOffset, crc);
+    buffer.rewind();
+    return buffer;
+  }
+
+  public static PositionDeleteIndex deserialize(byte[] bytes, DeleteFile deleteFile) {
+    ByteBuffer buffer = ByteBuffer.wrap(bytes);
+    int bitmapDataLength = readBitmapDataLength(buffer, deleteFile);
+    RoaringPositionBitmap bitmap = deserializeBitmap(bytes, bitmapDataLength, deleteFile);
+    int crc = computeChecksum(bytes, bitmapDataLength);
+    int crcOffset = LENGTH_SIZE_BYTES + bitmapDataLength;
+    int expectedCrc = buffer.getInt(crcOffset);
+    Preconditions.checkArgument(crc == expectedCrc, "Invalid CRC");
+    return new BitmapPositionDeleteIndex(bitmap, deleteFile);
+  }
+
+  // computes and validates the length of the bitmap data (magic bytes + bitmap)
+  private static int computeBitmapDataLength(RoaringPositionBitmap bitmap) {
+    long length = MAGIC_NUMBER_SIZE_BYTES + bitmap.serializedSizeInBytes();
+    long bufferSize = LENGTH_SIZE_BYTES + length + CRC_SIZE_BYTES;
+    Preconditions.checkState(bufferSize <= Integer.MAX_VALUE, "Can't serialize index > 2GB");
+    return (int) length;
+  }
+
+  // serializes the bitmap data (magic bytes + bitmap) using the little-endian byte order
+  private static void serializeBitmapData(
+      byte[] bytes, int bitmapDataLength, RoaringPositionBitmap bitmap) {
+    ByteBuffer bitmapData = pointToBitmapData(bytes, bitmapDataLength);
+    bitmapData.putInt(MAGIC_NUMBER);
+    bitmap.serialize(bitmapData);
+  }
+
+  // points to the bitmap data in the blob
+  private static ByteBuffer pointToBitmapData(byte[] bytes, int bitmapDataLength) {
+    ByteBuffer bitmapData = ByteBuffer.wrap(bytes, BITMAP_DATA_OFFSET, bitmapDataLength);
+    bitmapData.order(ByteOrder.LITTLE_ENDIAN);
+    return bitmapData;
+  }
+
+  // checks the blob size is equal to the bitmap data length + extra bytes for length and CRC
+  private static int readBitmapDataLength(ByteBuffer buffer, DeleteFile deleteFile) {
+    int length = buffer.getInt();
+    long expectedLength = deleteFile.contentSizeInBytes() - LENGTH_SIZE_BYTES - CRC_SIZE_BYTES;
+    Preconditions.checkArgument(
+        length == expectedLength,
+        "Invalid bitmap data length %s, expected %s",

Review Comment:
   ```suggestion
           "Invalid bitmap data length: %s, expected %s",
   ```



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: issues-unsubscr...@iceberg.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


---------------------------------------------------------------------
To unsubscribe, e-mail: issues-unsubscr...@iceberg.apache.org
For additional commands, e-mail: issues-h...@iceberg.apache.org

Reply via email to