amitgilad3 commented on code in PR #1608:
URL: https://github.com/apache/iceberg-python/pull/1608#discussion_r1958750651
##########
pyiceberg/table/inspect.py:
##########
@@ -157,74 +158,96 @@ def _readable_metrics_struct(bound_type: PrimitiveType) -> pa.StructType:
                 pa.field("readable_metrics", pa.struct(readable_metrics_struct), nullable=True),
             ]
         )
+        return entries_schema
+
+    def _get_entries(self, schema: "pa.Schema", manifest: ManifestFile, discard_deleted: bool = True) -> "pa.Table":
+        import pyarrow as pa
+        entries_schema = self._get_entries_schema()
         entries = []
-        snapshot = self._get_snapshot(snapshot_id)
-        for manifest in snapshot.manifests(self.tbl.io):
-            for entry in manifest.fetch_manifest_entry(io=self.tbl.io):
-                column_sizes = entry.data_file.column_sizes or {}
-                value_counts = entry.data_file.value_counts or {}
-                null_value_counts = entry.data_file.null_value_counts or {}
-                nan_value_counts = entry.data_file.nan_value_counts or {}
-                lower_bounds = entry.data_file.lower_bounds or {}
-                upper_bounds = entry.data_file.upper_bounds or {}
-                readable_metrics = {
-                    schema.find_column_name(field.field_id): {
-                        "column_size": column_sizes.get(field.field_id),
-                        "value_count": value_counts.get(field.field_id),
-                        "null_value_count": null_value_counts.get(field.field_id),
-                        "nan_value_count": nan_value_counts.get(field.field_id),
-                        # Makes them readable
-                        "lower_bound": from_bytes(field.field_type, lower_bound)
-                        if (lower_bound := lower_bounds.get(field.field_id))
-                        else None,
-                        "upper_bound": from_bytes(field.field_type, upper_bound)
-                        if (upper_bound := upper_bounds.get(field.field_id))
-                        else None,
-                    }
-                    for field in self.tbl.metadata.schema().fields
+        for entry in manifest.fetch_manifest_entry(io=self.tbl.io, discard_deleted=discard_deleted):
+            column_sizes = entry.data_file.column_sizes or {}
+            value_counts = entry.data_file.value_counts or {}
+            null_value_counts = entry.data_file.null_value_counts or {}
+            nan_value_counts = entry.data_file.nan_value_counts or {}
+            lower_bounds = entry.data_file.lower_bounds or {}
+            upper_bounds = entry.data_file.upper_bounds or {}
+            readable_metrics = {
+                schema.find_column_name(field.field_id): {
+                    "column_size": column_sizes.get(field.field_id),
+                    "value_count": value_counts.get(field.field_id),
+                    "null_value_count": null_value_counts.get(field.field_id),
+                    "nan_value_count": nan_value_counts.get(field.field_id),
+                    # Makes them readable
+                    "lower_bound": from_bytes(field.field_type, lower_bound)
+                    if (lower_bound := lower_bounds.get(field.field_id))
+                    else None,
+                    "upper_bound": from_bytes(field.field_type, upper_bound)
+                    if (upper_bound := upper_bounds.get(field.field_id))
+                    else None,
                 }
+                for field in self.tbl.metadata.schema().fields
+            }

-                partition = entry.data_file.partition
-                partition_record_dict = {
-                    field.name: partition[pos]
-                    for pos, field in enumerate(self.tbl.metadata.specs()[manifest.partition_spec_id].fields)
-                }
+            partition = entry.data_file.partition
+            partition_record_dict = {
+                field.name: partition[pos]
+                for pos, field in enumerate(self.tbl.metadata.specs()[manifest.partition_spec_id].fields)
+            }

-                entries.append(
-                    {
-                        "status": entry.status.value,
-                        "snapshot_id": entry.snapshot_id,
-                        "sequence_number": entry.sequence_number,
-                        "file_sequence_number": entry.file_sequence_number,
-                        "data_file": {
-                            "content": entry.data_file.content,
-                            "file_path": entry.data_file.file_path,
-                            "file_format": entry.data_file.file_format,
-                            "partition": partition_record_dict,
-                            "record_count": entry.data_file.record_count,
-                            "file_size_in_bytes": entry.data_file.file_size_in_bytes,
-                            "column_sizes": dict(entry.data_file.column_sizes),
-                            "value_counts": dict(entry.data_file.value_counts),
-                            "null_value_counts": dict(entry.data_file.null_value_counts),
-                            "nan_value_counts": dict(entry.data_file.nan_value_counts),
-                            "lower_bounds": entry.data_file.lower_bounds,
-                            "upper_bounds": entry.data_file.upper_bounds,
-                            "key_metadata": entry.data_file.key_metadata,
-                            "split_offsets": entry.data_file.split_offsets,
-                            "equality_ids": entry.data_file.equality_ids,
-                            "sort_order_id": entry.data_file.sort_order_id,
-                            "spec_id": entry.data_file.spec_id,
-                        },
-                        "readable_metrics": readable_metrics,
-                    }
-                )
+            entries.append(
+                {
+                    "status": entry.status.value,
+                    "snapshot_id": entry.snapshot_id,
+                    "sequence_number": entry.sequence_number,
+                    "file_sequence_number": entry.file_sequence_number,
+                    "data_file": {
+                        "content": entry.data_file.content,
+                        "file_path": entry.data_file.file_path,
+                        "file_format": entry.data_file.file_format,
+                        "partition": partition_record_dict,
+                        "record_count": entry.data_file.record_count,
+                        "file_size_in_bytes": entry.data_file.file_size_in_bytes,
+                        "column_sizes": dict(entry.data_file.column_sizes) or None,
+                        "value_counts": dict(entry.data_file.value_counts) if entry.data_file.value_counts is not None else None,
+                        "null_value_counts": dict(entry.data_file.null_value_counts)
+                        if entry.data_file.null_value_counts is not None
+                        else None,
+                        "nan_value_counts": dict(entry.data_file.nan_value_counts)
+                        if entry.data_file.nan_value_counts is not None
+                        else None,

Review Comment:
   I used the same logic you used in lines L645-L650. If I use the dict() or None form, the code breaks when it tries to evaluate dict(None).
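For readers outside the thread, here is a minimal standalone sketch of the failure mode the comment describes; the variable name value_counts is illustrative, not taken from the PR:

    # Why `dict(x) or None` fails when x is None: dict(None) is evaluated
    # eagerly, before `or` gets a chance to short-circuit.
    value_counts = None  # e.g. a data file whose value_counts metadata is absent

    try:
        counts = dict(value_counts) or None
    except TypeError as exc:
        print(exc)  # 'NoneType' object is not iterable

    # The conditional expression used in the diff only calls dict() on a real mapping:
    counts = dict(value_counts) if value_counts is not None else None
    print(counts)  # None

Note the two forms also differ on an empty mapping: dict({}) or None collapses the empty dict to None, while the conditional form preserves it, which matters if downstream code distinguishes missing stats from empty stats.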