JacobSMoller commented on PR #2252:
URL: https://github.com/apache/iceberg-python/pull/2252#issuecomment-3116994950

   Small script for reproducing the issue:
   ```python
   #!/usr/bin/env python3
   """
   Simple reproduction of PyIceberg Snappy checksum bug
   """
   
   import struct
   import binascii
   import snappy
   def create_pyiceberg_snappy_block():
       """
       Build a Snappy block in the layout this script attributes to PyIceberg.

       The block is the Snappy-compressed payload immediately followed by a
       4-byte big-endian CRC32 of the *uncompressed* payload.

       Returns:
           bytes: ``snappy.compress(payload) + struct.pack(">I", crc32)``.
       """
       print("Creating PyIceberg-format Snappy block...")

       # Original data that represents an Iceberg manifest entry.  Spelled as
       # adjacent bytes literals so the value stays intact when lines are
       # wrapped (a single literal split across lines is a syntax error).
       original_data = (
           b'{"manifest_path": "gs://bucket/manifest.avro", '
           b'"manifest_length": 1234, "added_files_count": 5}'
       )

       # Compress with Snappy
       compressed = snappy.compress(original_data)

       # Calculate CRC32 checksum of the ORIGINAL data
       # (this is what PyIceberg does)
       crc32 = binascii.crc32(original_data) & 0xFFFFFFFF
       crc32_bytes = struct.pack(">I", crc32)

       print(f"Orginal data checksum: {crc32:08x}")

       # Create the block: compressed data + checksum
       snappy_block = compressed + crc32_bytes

       return snappy_block
   
   def demonstrate_pyiceberg_bug(data):
       """
       Reproduce the buggy read path: the checksum is read after truncation.

       The 4-byte trailer is cut off first, so the "checksum" extracted
       afterwards is really the last four bytes of the compressed payload.
       The CRC32 comparison is therefore made against the wrong value.

       Args:
           data: Snappy-compressed payload followed by a 4-byte big-endian
               CRC32 of the uncompressed payload.
       """
       print("Demonstrating the PyIceberg Snappy checksum bug")
       print("=" * 70)
       # Step 1: Truncate BEFORE getting checksum (BUG!)
       data = data[0:-4]

       # Step 2: Decompress
       uncompressed = snappy.decompress(data)
       print(f"Uncompressed: {uncompressed}")

       # Step 3: Get checksum from truncated data (BUG!) -- the real trailer
       # is already gone, so these are compressed-payload bytes.
       checksum = data[-4:]
       print(f"Checksum: {checksum.hex()} ({checksum})")

       # Step 4: CRC32 validation compares against the wrong bytes, so it is
       # expected to report a mismatch.
       computed_crc = binascii.crc32(uncompressed) & 0xFFFFFFFF
       buggy_stored_crc = struct.unpack(">I", checksum)[0]
       # Reuse the values computed above; this also avoids nesting double
       # quotes inside an f-string, which only parses on Python 3.12+.
       print(f"Computed crc32: {computed_crc:08x}")
       print(f"Stored crc32: {buggy_stored_crc:08x}")
       print(f"Match: {computed_crc == buggy_stored_crc}")
       
       
       
   def demonstrate_pyiceberg_fix(data):
       """
       Show the corrected read path: grab the trailer first, then truncate.

       The last four bytes are saved as the stored checksum before they are
       stripped from the block; the remainder is decompressed and its CRC32
       is compared with the stored value.

       Args:
           data: Snappy-compressed payload followed by a 4-byte big-endian
               CRC32 of the uncompressed payload.
       """
       print()
       print("Demonstrating the fix for the PyIceberg Snappy checksum bug")
       print("=" * 70)

       # Split the block into payload and trailer; the trailer is captured
       # while it is still present.
       trailer = data[-4:]
       payload = data[:-4]

       plain = snappy.decompress(payload)
       actual = binascii.crc32(plain) & 0xFFFFFFFF
       (expected,) = struct.unpack(">I", trailer)

       print(f"Checksum: {trailer.hex()} ({trailer})")
       print(f"Uncompressed: {plain}")
       print(f"Computed crc32: {actual:08x}")
       print(f"Stored crc32: {expected:08x}")
       print(f"Match: {actual == expected}")
   
   def main():
       """Build one sample block and run it through both read paths."""
       block = create_pyiceberg_snappy_block()

       # Same input for both demonstrations: buggy path first, then the fix.
       demonstrate_pyiceberg_bug(block)
       demonstrate_pyiceberg_fix(block)
   # Run the demonstration only when this file is executed as a script.
   if __name__ == "__main__":
       main() 
   ```


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to