Fokko commented on code in PR #8622: URL: https://github.com/apache/iceberg/pull/8622#discussion_r1335530122
########## python/pyiceberg/manifest.py: ########## @@ -79,88 +95,107 @@ def __repr__(self) -> str: return f"FileFormat.{self.name}" -DATA_FILE_TYPE = StructType( - NestedField( - field_id=134, - name="content", - field_type=IntegerType(), - required=False, - doc="Contents of the file: 0=data, 1=position deletes, 2=equality deletes", - initial_default=DataFileContent.DATA, - ), - NestedField(field_id=100, name="file_path", field_type=StringType(), required=True, doc="Location URI with FS scheme"), - NestedField( - field_id=101, name="file_format", field_type=StringType(), required=True, doc="File format name: avro, orc, or parquet" - ), - NestedField( - field_id=102, - name="partition", - field_type=StructType(), - required=True, - doc="Partition data tuple, schema based on the partition spec", - ), - NestedField(field_id=103, name="record_count", field_type=LongType(), required=True, doc="Number of records in the file"), - NestedField(field_id=104, name="file_size_in_bytes", field_type=LongType(), required=True, doc="Total file size in bytes"), - NestedField( - field_id=108, - name="column_sizes", - field_type=MapType(key_id=117, key_type=IntegerType(), value_id=118, value_type=LongType()), - required=False, - doc="Map of column id to total size on disk", - ), - NestedField( - field_id=109, - name="value_counts", - field_type=MapType(key_id=119, key_type=IntegerType(), value_id=120, value_type=LongType()), - required=False, - doc="Map of column id to total count, including null and NaN", - ), - NestedField( - field_id=110, - name="null_value_counts", - field_type=MapType(key_id=121, key_type=IntegerType(), value_id=122, value_type=LongType()), - required=False, - doc="Map of column id to null value count", - ), - NestedField( - field_id=137, - name="nan_value_counts", - field_type=MapType(key_id=138, key_type=IntegerType(), value_id=139, value_type=LongType()), - required=False, - doc="Map of column id to number of NaN values in the column", - ), - NestedField( - field_id=125, - name="lower_bounds", - field_type=MapType(key_id=126, key_type=IntegerType(), value_id=127, value_type=BinaryType()), - required=False, - doc="Map of column id to lower bound", - ), - NestedField( - field_id=128, - name="upper_bounds", - field_type=MapType(key_id=129, key_type=IntegerType(), value_id=130, value_type=BinaryType()), - required=False, - doc="Map of column id to upper bound", - ), - NestedField(field_id=131, name="key_metadata", field_type=BinaryType(), required=False, doc="Encryption key metadata blob"), - NestedField( - field_id=132, - name="split_offsets", - field_type=ListType(element_id=133, element_type=LongType(), element_required=True), - required=False, - doc="Splittable offsets", - ), - NestedField( - field_id=135, - name="equality_ids", - field_type=ListType(element_id=136, element_type=LongType(), element_required=True), - required=False, - doc="Equality comparison field IDs", - ), - NestedField(field_id=140, name="sort_order_id", field_type=IntegerType(), required=False, doc="Sort order ID"), - NestedField(field_id=141, name="spec_id", field_type=IntegerType(), required=False, doc="Partition spec ID"), -) +def data_file_type(partition_type: StructType) -> StructType: Review Comment: What do you think of the following approach: ```python DATA_FILE_TYPE = StructType( NestedField( field_id=134, name="content", field_type=IntegerType(), required=False, doc="Contents of the file: 0=data, 1=position deletes, 2=equality deletes", initial_default=DataFileContent.DATA, ), ... NestedField( field_id=102, name="partition", field_type=StructType(), required=True, doc="Partition data tuple, schema based on the partition spec", ), ... ) def datafile_with_partition(partition_type: StructType) -> StructType() return StructType( *[NestedField( field_id=102, name="partition", field_type=partition_type, required=True, doc="Partition data tuple, schema based on the partition spec", ) if field.field_id = 102 else field for field in DATA_FILE_TYPE.fields] ) ``` This keeps the old `DATA_FILE_TYPE` in place, and allows the ability to create new ones with any partition spec, and we don't recreate all the fields all the time. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: issues-unsubscr...@iceberg.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org --------------------------------------------------------------------- To unsubscribe, e-mail: issues-unsubscr...@iceberg.apache.org For additional commands, e-mail: issues-h...@iceberg.apache.org