kevinjqliu commented on code in PR #1066:
URL: https://github.com/apache/iceberg-python/pull/1066#discussion_r1740141923
##########
mkdocs/docs/api.md:
##########
@@ -844,6 +844,19 @@
 readable_metrics: [
   [6.0989]]
 ```
+<!-- prettier-ignore-start -->

Review Comment:
   With the new markdown linter (#1118), this might not be necessary anymore.

##########
pyiceberg/table/__init__.py:
##########
@@ -4308,7 +4308,9 @@ def history(self) -> "pa.Table":

         return pa.Table.from_pylist(history, schema=history_schema)

-    def files(self, snapshot_id: Optional[int] = None) -> "pa.Table":
+    def _files(
+        self, snapshot_id: Optional[int] = None, data_file_content_types: Optional[List[DataFileContent]] = None

Review Comment:
   nit: rename this parameter to `data_file_filter`, and type it as a Set instead of a List (see the first sketch at the end of this message).

##########
pyiceberg/table/__init__.py:
##########
@@ -4393,12 +4397,12 @@ def _readable_metrics_struct(bound_type: PrimitiveType) -> pa.StructType:
                     "spec_id": data_file.spec_id,
                     "record_count": data_file.record_count,
                     "file_size_in_bytes": data_file.file_size_in_bytes,
-                    "column_sizes": dict(data_file.column_sizes),
-                    "value_counts": dict(data_file.value_counts),
-                    "null_value_counts": dict(data_file.null_value_counts),
-                    "nan_value_counts": dict(data_file.nan_value_counts),
-                    "lower_bounds": dict(data_file.lower_bounds),
-                    "upper_bounds": dict(data_file.upper_bounds),
+                    "column_sizes": dict(data_file.column_sizes) if data_file.column_sizes is not None else None,

Review Comment:
   nit: push this logic up to the line above (see the second sketch at the end of this message):
   ```
   column_sizes = data_file.column_sizes or {}
   ```

##########
tests/integration/test_inspect_table.py:
##########
@@ -672,126 +672,141 @@ def test_inspect_files(
     # append more data
     tbl.append(arrow_table_with_null)

-    df = tbl.refresh().inspect.files()
+    # configure table properties
+    if format_version == 2:
+        with tbl.transaction() as txn:
+            txn.set_properties({"write.delete.mode": "merge-on-read"})
+    spark.sql(f"DELETE FROM {identifier} WHERE int = 1")

Review Comment:
   Is this to produce delete files? (See the third sketch at the end of this message.)

##########
pyiceberg/table/__init__.py:
##########
@@ -4393,12 +4397,12 @@ def _readable_metrics_struct(bound_type: PrimitiveType) -> pa.StructType:
                     "spec_id": data_file.spec_id,
                     "record_count": data_file.record_count,
                     "file_size_in_bytes": data_file.file_size_in_bytes,
-                    "column_sizes": dict(data_file.column_sizes),
-                    "value_counts": dict(data_file.value_counts),
-                    "null_value_counts": dict(data_file.null_value_counts),
-                    "nan_value_counts": dict(data_file.nan_value_counts),
-                    "lower_bounds": dict(data_file.lower_bounds),
-                    "upper_bounds": dict(data_file.upper_bounds),
+                    "column_sizes": dict(data_file.column_sizes) if data_file.column_sizes is not None else None,

Review Comment:
   Also, I wonder if the null/None behavior is due to the `.toPandas()` conversion when the table is queried through Spark.
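First sketch, illustrating the `data_file_filter` rename suggested above. This is a minimal, self-contained sketch, not the PR's actual code: `DataFileContent` is redefined locally (values follow the Iceberg spec), and `filter_entries` is a hypothetical helper standing in for the filtering step inside `_files`.

```python
from enum import Enum
from typing import Iterable, List, Optional, Set


# Stand-in for pyiceberg.manifest.DataFileContent, redefined here only to
# keep the sketch self-contained (values follow the Iceberg spec).
class DataFileContent(int, Enum):
    DATA = 0
    POSITION_DELETES = 1
    EQUALITY_DELETES = 2


def filter_entries(
    contents: Iterable[DataFileContent],
    data_file_filter: Optional[Set[DataFileContent]] = None,
) -> List[DataFileContent]:
    """Keep only entries whose content type is in the filter (None = keep all)."""
    # A Set fits better than a List here: the only operation performed is a
    # membership test, and order/duplicates carry no meaning for content types.
    return [c for c in contents if data_file_filter is None or c in data_file_filter]


all_types = [DataFileContent.DATA, DataFileContent.POSITION_DELETES]
print(filter_entries(all_types))                           # both entries
print(filter_entries(all_types, {DataFileContent.DATA}))   # data files only
print(filter_entries(all_types, {DataFileContent.POSITION_DELETES,
                                 DataFileContent.EQUALITY_DELETES}))  # delete files only
```

A filter of this shape could back thin public wrappers (e.g. a `data_files()` passing `{DataFileContent.DATA}` and a `delete_files()` passing the two delete types), though those method names are an assumption about where the PR is heading.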
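Second sketch, showing the "push this logic to the line above" nit with a hand-rolled `DataFile` stand-in. One caveat worth flagging: `or {}` turns a missing metric into an empty dict, whereas the diff's `dict(...) if ... is not None else None` keeps it as a null, which may matter for the resulting Arrow table.

```python
from dataclasses import dataclass
from typing import Dict, Optional


# Hand-rolled stand-in for the real DataFile; only the two metrics used in
# this sketch are modeled.
@dataclass
class DataFile:
    column_sizes: Optional[Dict[int, int]] = None
    value_counts: Optional[Dict[int, int]] = None


def build_row(data_file: DataFile) -> dict:
    # Hoist the None-handling onto its own line, as the nit suggests.
    # Caveat: `or {}` yields an empty dict where the diff's
    # `dict(...) if ... is not None else None` would yield a null.
    column_sizes = data_file.column_sizes or {}
    value_counts = data_file.value_counts or {}
    return {
        "column_sizes": dict(column_sizes),
        "value_counts": dict(value_counts),
    }


print(build_row(DataFile()))                        # {'column_sizes': {}, 'value_counts': {}}
print(build_row(DataFile(column_sizes={1: 128})))   # {'column_sizes': {1: 128}, 'value_counts': {}}
```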
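Third sketch, on the delete-files question: with `write.delete.mode=merge-on-read`, a row-level DELETE writes positional delete files instead of rewriting data files, so the `files` metadata table should gain rows whose `content` column is not DATA. A hedged fragment of an assertion the test could make, assuming `tbl` is the integration test's table after the DELETE and `DataFileContent` is in scope:

```python
# After the merge-on-read DELETE, refresh and inspect the files table.
files_df = tbl.refresh().inspect.files().to_pandas()

# The DELETE should surface as positional delete files, i.e. rows whose
# `content` is not DataFileContent.DATA (0).
assert (files_df["content"] == DataFileContent.POSITION_DELETES).any()
```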