rdblue commented on code in PR #6525:
URL: https://github.com/apache/iceberg/pull/6525#discussion_r1064184632


##########
python/pyiceberg/manifest.py:
##########
@@ -97,48 +178,82 @@ class DataFile(IcebergBaseModel):
     sort_order_id: Optional[int] = Field()
 
 
-class ManifestEntry(IcebergBaseModel):
+MANIFEST_ENTRY_SCHEMA = Schema(
+    NestedField(0, "status", IntegerType(), required=True),
+    NestedField(1, "snapshot_id", LongType(), required=False),
+    NestedField(3, "sequence_number", LongType(), required=False),
+    NestedField(4, "file_sequence_number", LongType(), required=False),
+    NestedField(2, "data_file", DATA_FILE, required=False),
+)
+
+
+class ManifestEntry(PydanticStruct):
     status: ManifestEntryStatus = Field()
     snapshot_id: Optional[int] = Field()
     sequence_number: Optional[int] = Field()
+    file_sequence_number: Optional[int] = Field()
     data_file: DataFile = Field()
 
 
-class PartitionFieldSummary(IcebergBaseModel):
+PARTITION_FIELD_SUMMARY_TYPE = StructType(
+    NestedField(509, "contains_null", BooleanType(), required=True),
+    NestedField(518, "contains_nan", BooleanType(), required=False),
+    NestedField(510, "lower_bound", BinaryType(), required=False),
+    NestedField(511, "upper_bound", BinaryType(), required=False),
+)
+
+
+class PartitionFieldSummary(PydanticStruct):
     contains_null: bool = Field()
     contains_nan: Optional[bool] = Field()
     lower_bound: Optional[bytes] = Field()
     upper_bound: Optional[bytes] = Field()
 
 
-class ManifestFile(IcebergBaseModel):
+MANIFEST_FILE_SCHEMA: Schema = Schema(
+    NestedField(500, "manifest_path", StringType(), required=True, 
doc="Location URI with FS scheme"),
+    NestedField(501, "manifest_length", LongType(), required=True),
+    NestedField(502, "partition_spec_id", IntegerType(), required=True),
+    NestedField(517, "content", IntegerType(), required=False),
+    NestedField(515, "sequence_number", LongType(), required=False),
+    NestedField(516, "min_sequence_number", LongType(), required=False),
+    NestedField(503, "added_snapshot_id", LongType(), required=False),
+    NestedField(504, "added_files_count", IntegerType(), required=False),
+    NestedField(505, "existing_files_count", IntegerType(), required=False),
+    NestedField(506, "deleted_files_count", IntegerType(), required=False),
+    NestedField(512, "added_rows_count", LongType(), required=False),
+    NestedField(513, "existing_rows_count", LongType(), required=False),
+    NestedField(514, "deleted_rows_count", LongType(), required=False),
+    NestedField(507, "partitions", ListType(508, PARTITION_FIELD_SUMMARY_TYPE, 
element_required=True), required=False),
+    NestedField(519, "key_metadata", BinaryType(), required=False),
+)
+
+
+class ManifestFile(PydanticStruct):
     manifest_path: str = Field()
     manifest_length: int = Field()
     partition_spec_id: int = Field()
     content: ManifestContent = Field(default=ManifestContent.DATA)
-    sequence_number: int = Field(default=0)
-    min_sequence_number: int = Field(default=0)
+    sequence_number: Optional[int] = Field()
+    min_sequence_number: Optional[int] = Field()
     added_snapshot_id: Optional[int] = Field()
-    added_data_files_count: Optional[int] = Field()
-    existing_data_files_count: Optional[int] = Field()
-    deleted_data_files_count: Optional[int] = Field()
+    added_files_count: Optional[int] = Field()
+    existing_files_count: Optional[int] = Field()
+    deleted_files_count: Optional[int] = Field()
     added_rows_count: Optional[int] = Field()
-    existing_rows_counts: Optional[int] = Field()
+    existing_rows_count: Optional[int] = Field()
     deleted_rows_count: Optional[int] = Field()
-    partitions: Optional[List[PartitionFieldSummary]] = Field()
+    partitions: List[PartitionFieldSummary] = Field(default_factory=list)
     key_metadata: Optional[bytes] = Field()
 
     def fetch_manifest_entry(self, io: FileIO) -> List[ManifestEntry]:
         file = io.new_input(self.manifest_path)
         return list(read_manifest_entry(file))
 
 
-def read_manifest_entry(input_file: InputFile) -> Iterator[ManifestEntry]:
-    with AvroFile(input_file) as reader:
-        schema = reader.schema
-        for record in reader:
-            dict_repr = _convert_pos_to_dict(schema, record)
-            yield ManifestEntry(**dict_repr)
+def read_manifest_entry(input_file: InputFile) -> List[ManifestEntry]:
+    with AvroFile[ManifestEntry](input_file, MANIFEST_ENTRY_SCHEMA, {-1: 
ManifestEntry, 2: DataFile, 102: Record}) as reader:
+        return list(reader)

Review Comment:
   I think these should continue to use generators so that reusing struct 
instances is easy. Why change to list?



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to