rdblue commented on code in PR #6490: URL: https://github.com/apache/iceberg/pull/6490#discussion_r1059522065
########## python/pyiceberg/manifest.py: ########## @@ -76,137 +66,283 @@ def __repr__(self) -> str: return f"FileFormat.{self.name}" -class DataFile(IcebergBaseModel): - content: DataFileContent = Field(default=DataFileContent.DATA) - file_path: str = Field() - file_format: FileFormat = Field() - partition: Dict[str, Any] = Field() - record_count: int = Field() - file_size_in_bytes: int = Field() - block_size_in_bytes: Optional[int] = Field() - column_sizes: Optional[Dict[int, int]] = Field() - value_counts: Optional[Dict[int, int]] = Field() - null_value_counts: Optional[Dict[int, int]] = Field() - nan_value_counts: Optional[Dict[int, int]] = Field() - distinct_counts: Optional[Dict[int, int]] = Field() - lower_bounds: Optional[Dict[int, bytes]] = Field() - upper_bounds: Optional[Dict[int, bytes]] = Field() - key_metadata: Optional[bytes] = Field() - split_offsets: Optional[List[int]] = Field() - equality_ids: Optional[List[int]] = Field() - sort_order_id: Optional[int] = Field() - - -class ManifestEntry(IcebergBaseModel): - status: ManifestEntryStatus = Field() - snapshot_id: Optional[int] = Field() - sequence_number: Optional[int] = Field() - data_file: DataFile = Field() - - -class PartitionFieldSummary(IcebergBaseModel): - contains_null: bool = Field() - contains_nan: Optional[bool] = Field() - lower_bound: Optional[bytes] = Field() - upper_bound: Optional[bytes] = Field() - - -class ManifestFile(IcebergBaseModel): - manifest_path: str = Field() - manifest_length: int = Field() - partition_spec_id: int = Field() - content: ManifestContent = Field(default=ManifestContent.DATA) - sequence_number: int = Field(default=0) - min_sequence_number: int = Field(default=0) - added_snapshot_id: Optional[int] = Field() - added_data_files_count: Optional[int] = Field() - existing_data_files_count: Optional[int] = Field() - deleted_data_files_count: Optional[int] = Field() - added_rows_count: Optional[int] = Field() - existing_rows_counts: Optional[int] = Field() - deleted_rows_count: Optional[int] = Field() - partitions: Optional[List[PartitionFieldSummary]] = Field() - key_metadata: Optional[bytes] = Field() - - def fetch_manifest_entry(self, io: FileIO) -> List[ManifestEntry]: +class DataFile(Record): + @staticmethod + def from_record(record: Record, format_version: int) -> Union[DataFileV1, DataFileV2]: + if format_version == 1: + return DataFileV1(*record) + elif format_version == 2: + return DataFileV2(*record) + else: + raise ValueError(f"Unknown format-version: {format_version}") + + file_path: str + file_format: FileFormat + partition: Record + record_count: int + file_size_in_bytes: int + block_size_in_bytes: Optional[int] = None + column_sizes: Optional[Dict[int, int]] = None + value_counts: Optional[Dict[int, int]] = None + null_value_counts: Optional[Dict[int, int]] = None + nan_value_counts: Optional[Dict[int, int]] = None + distinct_counts: Optional[Dict[int, int]] = None # Does not seem to be used on the Java side!? + lower_bounds: Optional[Dict[int, bytes]] = None + upper_bounds: Optional[Dict[int, bytes]] = None + key_metadata: Optional[bytes] = None + split_offsets: Optional[List[int]] = None + equality_ids: Optional[List[int]] = None + sort_order_id: Optional[int] = None + content: DataFileContent = DataFileContent.DATA + + +class DataFileV1(DataFile): + def __setitem__(self, pos: int, value: Any) -> None: Review Comment: I think that these methods need to be dynamic based on the schema that is used to read the file. That's why we have a mapping from the canonical (full schema) positions and the positions from the expected schema: https://github.com/apache/iceberg/blob/master/core/src/main/java/org/apache/iceberg/BaseFile.java#L240-L242 That would also make it so you can have just one `DataFile` class because v1 is just a projection of v2. There are a few couple of version-specific classes in Java ([V1Metadata](https://github.com/apache/iceberg/blob/master/core/src/main/java/org/apache/iceberg/V1Metadata.java) and [V2Metadata](https://github.com/apache/iceberg/blob/master/core/src/main/java/org/apache/iceberg/V2Metadata.java)), but those are used only for writing, not for reading. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: issues-unsubscr...@iceberg.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org --------------------------------------------------------------------- To unsubscribe, e-mail: issues-unsubscr...@iceberg.apache.org For additional commands, e-mail: issues-h...@iceberg.apache.org