JacobSMoller commented on PR #2252:
URL: https://github.com/apache/iceberg-python/pull/2252#issuecomment-3116994950
Small script for simulating the issue
```python
#!/usr/bin/env python3
"""
Simple reproduction of PyIceberg Snappy checksum bug
"""
import struct
import binascii
import snappy
def create_pyiceberg_snappy_block():
    """
    Mimic PyIceberg's Snappy compression format with added checksum.

    Prints a progress line and the checksum of the sample payload.

    Returns:
        bytes: the Snappy-compressed sample payload immediately followed by
        the 4-byte big-endian CRC32 of the *uncompressed* payload.
    """
    print("Creating PyIceberg-format Snappy block...")
    # Original data that represents an Iceberg manifest entry.
    # Adjacent bytes literals are concatenated by the parser, which keeps the
    # payload on readable line lengths without embedding a newline in it.
    original_data = (
        b'{"manifest_path": "gs://bucket/manifest.avro", '
        b'"manifest_length": 1234, "added_files_count": 5}'
    )
    # Compress with Snappy
    compressed = snappy.compress(original_data)
    # Calculate CRC32 checksum of the ORIGINAL data (this is what PyIceberg does)
    crc32 = binascii.crc32(original_data) & 0xFFFFFFFF
    # ">I" = big-endian unsigned 32-bit, so the trailer is always 4 bytes
    crc32_bytes = struct.pack(">I", crc32)
    print(f"Original data checksum: {crc32:08x}")
    # Create the block: compressed data + checksum
    snappy_block = compressed + crc32_bytes
    return snappy_block
def demonstrate_pyiceberg_bug(data):
    """
    Reproduce the issue: the checksum is extracted after truncating the data.

    Args:
        data: a block made of Snappy-compressed bytes followed by a 4-byte
            big-endian CRC32 of the uncompressed bytes.

    The function only prints its findings and returns nothing. Because the
    4-byte trailer is dropped *before* the checksum is read, the "stored"
    checksum is really the last 4 bytes of the compressed payload, so the
    comparison is expected to report a mismatch.
    """
    print("Demonstrating the PyIceberg Snappy checksum bug")
    print("=" * 70)
    # Step 1: Truncate BEFORE getting checksum (BUG!)
    data = data[0:-4]
    # Step 2: Decompress
    uncompressed = snappy.decompress(data)
    print(f"Uncompressed: {uncompressed}")
    # Step 3: Get checksum from truncated data (BUG!) - the real trailer is
    # already gone, so these are compressed payload bytes, not a checksum.
    checksum = data[-4:]
    print(f"Checksum: {checksum.hex()} ({checksum})")
    # Step 4: CRC32 validation fails
    computed_crc = binascii.crc32(uncompressed) & 0xFFFFFFFF
    buggy_stored_crc = struct.unpack(">I", checksum)[0]
    # Print the values computed above instead of recomputing them inline;
    # this also avoids reusing double quotes inside a double-quoted f-string,
    # which is a SyntaxError before Python 3.12.
    print(f"Computed crc32: {computed_crc:08x}")
    print(f"Stored crc32: {buggy_stored_crc:08x}")
    print(f"Match: {computed_crc == buggy_stored_crc}")
def demonstrate_pyiceberg_fix(data):
    """
    Show the corrected handling of a Snappy block with a trailing checksum.

    Args:
        data: compressed bytes followed by a 4-byte big-endian CRC32.

    The trailing 4 bytes are captured as the checksum at the same time as
    they are stripped from the payload, so the stored and computed CRC32
    values refer to the same data. Results are printed; nothing is returned.
    """
    print()
    print("Demonstrating the fix for the PyIceberg Snappy checksum bug")
    print("=" * 70)
    # Both slices read the untouched input, so the trailer cannot be lost.
    trailer, payload = data[-4:], data[:-4]
    plain = snappy.decompress(payload)
    actual_crc = binascii.crc32(plain) & 0xFFFFFFFF
    (expected_crc,) = struct.unpack(">I", trailer)
    report = (
        f"Checksum: {trailer.hex()} ({trailer})",
        f"Uncompressed: {plain}",
        f"Computed crc32: {actual_crc:08x}",
        f"Stored crc32: {expected_crc:08x}",
        f"Match: {actual_crc == expected_crc}",
    )
    for line in report:
        print(line)
def main():
    """Build one sample block, then run the buggy and the fixed reader on it."""
    # A single block is shared so both demonstrations see identical input.
    block = create_pyiceberg_snappy_block()
    # Buggy path first, corrected path second.
    demonstrate_pyiceberg_bug(block)
    demonstrate_pyiceberg_fix(block)
# Run the demonstration only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
```
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]