Fokko commented on code in PR #8084:
URL: https://github.com/apache/iceberg/pull/8084#discussion_r1269039875
##########
python/pyiceberg/avro/reader.py:
##########
@@ -321,51 +336,84 @@ def __repr__(self) -> str:
def __hash__(self) -> int:
"""Returns a hashed representation of the StructReader class."""
- return hash(self.field_readers)
+ return self._hash
-@dataclass(frozen=True)
+@dataclass(frozen=False, init=False)
class ListReader(Reader):
- __slots__ = ("element",)
+ __slots__ = ("element", "is_int_list", "_hash")
Review Comment:
```suggestion
__slots__ = ("element", "_is_int_list", "_hash")
```
##########
python/pyiceberg/avro/reader.py:
##########
@@ -267,10 +267,11 @@ def skip(self, decoder: BinaryDecoder) -> None:
class StructReader(Reader):
- __slots__ = ("field_readers", "create_struct", "struct")
+ __slots__ = ("field_readers", "create_struct", "struct",
"create_with_keyword", "_field_reader_functions", "_hash")
Review Comment:
Maybe we want to throw in some tests here?
```python
def test_read_struct() -> None:
    mis = io.BytesIO(b"\x18")
    decoder = StreamingBinaryDecoder(mis)
    struct = StructType(NestedField(1, "id", IntegerType(), required=True))
    reader = StructReader(((0, IntegerReader()),), Record, struct)
    result = reader.read(decoder)
    assert repr(result) == "Record[id=12]"
    assert len(reader.__dict__) == 0
```
This way we make sure that `__dict__` stays empty and that we actually take
advantage of `__slots__`.
##########
python/pyiceberg/avro/file.py:
##########
@@ -173,7 +173,7 @@ def __enter__(self) -> AvroFile[D]:
A generator returning the AvroStructs.
"""
self.input_stream = self.input_file.open(seekable=False)
- self.decoder = BinaryDecoder(self.input_stream)
+ self.decoder = StreamingBinaryDecoder(self.input_stream)
Review Comment:
Just to echo my thoughts here: ideally, we want to read the whole file at
once instead of streaming through it, since streaming potentially makes multiple
calls to S3. Maybe we want to increase the buffer size? PyArrow supports
`.read(None)` to read the whole file. I'm not sure; this probably needs some more
investigation.
##########
python/tests/avro/test_reader.py:
##########
@@ -20,7 +20,7 @@
import pytest
-from pyiceberg.avro.decoder import BinaryDecoder
+from pyiceberg.avro.decoder import StreamingBinaryDecoder
Review Comment:
Should we also loop over `AVAILABLE_DECODERS` here?
##########
python/pyiceberg/avro/reader.py:
##########
@@ -321,51 +336,84 @@ def __repr__(self) -> str:
def __hash__(self) -> int:
"""Returns a hashed representation of the StructReader class."""
- return hash(self.field_readers)
+ return self._hash
-@dataclass(frozen=True)
+@dataclass(frozen=False, init=False)
class ListReader(Reader):
- __slots__ = ("element",)
+ __slots__ = ("element", "is_int_list", "_hash")
element: Reader
+ def __init__(self, element: Reader) -> None:
+ super().__init__()
+ self.element = element
+ self._hash = hash(self.element)
+ self._is_int_list = isinstance(self.element, IntegerReader)
+
def read(self, decoder: BinaryDecoder) -> List[Any]:
- read_items = []
+ read_items: List[Any] = []
block_count = decoder.read_int()
while block_count != 0:
if block_count < 0:
block_count = -block_count
_ = decoder.read_int()
- for _ in range(block_count):
- read_items.append(self.element.read(decoder))
+ if self._is_int_list:
+ decoder.read_ints(block_count, read_items)
+ else:
+ for _ in range(block_count):
+ read_items.append(self.element.read(decoder))
block_count = decoder.read_int()
return read_items
def skip(self, decoder: BinaryDecoder) -> None:
_skip_map_array(decoder, lambda: self.element.skip(decoder))
+ def __hash__(self) -> int:
+ """Returns a hashed representation of the ListReader class."""
+ return self._hash
-@dataclass(frozen=True)
+
+@dataclass(frozen=False, init=False)
class MapReader(Reader):
- __slots__ = ("key", "value")
+ __slots__ = ("key", "value", "_is_int_int", "_is_int_bytes",
"_key_reader", "_value_reader", "_hash")
key: Reader
value: Reader
+ def __init__(self, key: Reader, value: Reader) -> None:
+ super().__init__()
+ self.key = key
+ self.value = value
+ self._is_int_int = isinstance(self.key, IntegerReader) and
isinstance(self.value, IntegerReader)
+ self._is_int_bytes = isinstance(self.key, IntegerReader) and
isinstance(self.value, BinaryReader)
+ self._key_reader = self.key.read
Review Comment:
Nit: I think we can also move these below:
```python
if self._is_int_int or self._is_int_bytes:
    ....
else:
    _key_reader = self.key.read
    _value_reader = self.value.read
```
They are only used when we don't use specialized readers.
##########
python/pyiceberg/avro/reader.py:
##########
@@ -267,10 +267,11 @@ def skip(self, decoder: BinaryDecoder) -> None:
class StructReader(Reader):
- __slots__ = ("field_readers", "create_struct", "struct")
+ __slots__ = ("field_readers", "create_struct", "struct",
"create_with_keyword", "_field_reader_functions", "_hash")
Review Comment:
```suggestion
__slots__ = ("field_readers", "create_struct", "struct",
"_create_with_keyword", "_field_reader_functions", "_hash")
```
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]