Fokko commented on code in PR #1443: URL: https://github.com/apache/iceberg-python/pull/1443#discussion_r1899070927
##########
pyiceberg/io/pyarrow.py:
##########
@@ -1237,16 +1257,20 @@ def _task_to_record_batches(
         # When V3 support is introduced, we will update `downcast_ns_timestamp_to_us` flag based on
         # the table format version.
         file_schema = pyarrow_to_schema(physical_schema, name_mapping, downcast_ns_timestamp_to_us=True)
+
         pyarrow_filter = None
         if bound_row_filter is not AlwaysTrue():
             translated_row_filter = translate_column_names(bound_row_filter, file_schema, case_sensitive=case_sensitive)
             bound_file_filter = bind(file_schema, translated_row_filter, case_sensitive=case_sensitive)
             pyarrow_filter = expression_to_pyarrow(bound_file_filter)

+        # Apply column projection rules for missing partitions and default values
+        # https://iceberg.apache.org/spec/#column-projection
         file_project_schema = prune_columns(file_schema, projected_field_ids, select_full_types=False)

-        if file_schema is None:
-            raise ValueError(f"Missing Iceberg schema in Metadata for file: {path}")
+        projected_missing_fields = _get_column_projection_values(
+            task.file, projected_schema, projected_field_ids, file_project_schema, partition_spec
+        )

Review Comment:
   ```suggestion
           projected_missing_fields = _get_column_projection_values(
               task.file, projected_schema, projected_field_ids, file_project_schema, partition_spec
           ) if partition_spec is not None else {}
   ```

##########
pyiceberg/io/pyarrow.py:
##########
@@ -1216,6 +1216,25 @@ def _field_id(self, field: pa.Field) -> int:
         return -1


+def _get_column_projection_values(
+    file: DataFile,
+    projected_schema: Schema,
+    projected_field_ids: Set[int],
+    file_project_schema: Schema,
+    partition_spec: Optional[PartitionSpec] = None,
+) -> Dict[str, object]:
+    """Apply Column Projection rules to File Schema."""
+    projected_missing_fields = {}
+
+    for field_id in projected_field_ids.difference(file_project_schema.field_ids):
+        if partition_spec is not None:

Review Comment:
   This is a pretty hot path in the execution, so I'd rather move this check out, as suggested in the other comment.

-- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: issues-unsubscr...@iceberg.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org --------------------------------------------------------------------- To unsubscribe, e-mail: issues-unsubscr...@iceberg.apache.org For additional commands, e-mail: issues-h...@iceberg.apache.org