Re: [PR] support all_entries in pyiceberg [iceberg-python]

via GitHub Sun, 16 Feb 2025 13:32:39 -0800


kevinjqliu commented on code in PR #1608:
URL: https://github.com/apache/iceberg-python/pull/1608#discussion_r1957418226



##########
pyiceberg/table/inspect.py:
##########
@@ -157,74 +158,96 @@ def _readable_metrics_struct(bound_type: PrimitiveType) 
-> pa.StructType:
                 pa.field("readable_metrics", 
pa.struct(readable_metrics_struct), nullable=True),
             ]
         )
+        return entries_schema
+
+    def _get_entries(self, schema: "pa.Schema", manifest: ManifestFile, 
discard_deleted: bool = True) -> "pa.Table":

Review Comment:
   schema here should be iceberg schema instead of pa.schema



##########
pyiceberg/table/inspect.py:
##########
@@ -157,74 +158,96 @@ def _readable_metrics_struct(bound_type: PrimitiveType) 
-> pa.StructType:
                 pa.field("readable_metrics", 
pa.struct(readable_metrics_struct), nullable=True),
             ]
         )
+        return entries_schema
+
+    def _get_entries(self, schema: "pa.Schema", manifest: ManifestFile, 
discard_deleted: bool = True) -> "pa.Table":

Review Comment:
   there are still a couple places in this function using `self.tbl.` which is 
the table's current metadata
   L189 & L195.
   We should use the schema and partition spec at the time of the snapshot 
instead



##########
mkdocs/docs/api.md:
##########
@@ -716,6 +716,8 @@ readable_metrics: [
 [6.0989]]
 ```
 
+To show all the table's current manifest entries for both data and delete 
files, use `table.inspect.all_entries()`.

Review Comment:
   i'd be great to add this to its own section, similar to 
https://iceberg.apache.org/docs/nightly/spark-queries/#all-metadata-tables



##########
pyiceberg/table/inspect.py:
##########
@@ -157,74 +158,96 @@ def _readable_metrics_struct(bound_type: PrimitiveType) 
-> pa.StructType:
                 pa.field("readable_metrics", 
pa.struct(readable_metrics_struct), nullable=True),
             ]
         )
+        return entries_schema
+
+    def _get_entries(self, schema: "pa.Schema", manifest: ManifestFile, 
discard_deleted: bool = True) -> "pa.Table":
+        import pyarrow as pa
 
+        entries_schema = self._get_entries_schema()
         entries = []
-        snapshot = self._get_snapshot(snapshot_id)
-        for manifest in snapshot.manifests(self.tbl.io):
-            for entry in manifest.fetch_manifest_entry(io=self.tbl.io):
-                column_sizes = entry.data_file.column_sizes or {}
-                value_counts = entry.data_file.value_counts or {}
-                null_value_counts = entry.data_file.null_value_counts or {}
-                nan_value_counts = entry.data_file.nan_value_counts or {}
-                lower_bounds = entry.data_file.lower_bounds or {}
-                upper_bounds = entry.data_file.upper_bounds or {}
-                readable_metrics = {
-                    schema.find_column_name(field.field_id): {
-                        "column_size": column_sizes.get(field.field_id),
-                        "value_count": value_counts.get(field.field_id),
-                        "null_value_count": 
null_value_counts.get(field.field_id),
-                        "nan_value_count": 
nan_value_counts.get(field.field_id),
-                        # Makes them readable
-                        "lower_bound": from_bytes(field.field_type, 
lower_bound)
-                        if (lower_bound := lower_bounds.get(field.field_id))
-                        else None,
-                        "upper_bound": from_bytes(field.field_type, 
upper_bound)
-                        if (upper_bound := upper_bounds.get(field.field_id))
-                        else None,
-                    }
-                    for field in self.tbl.metadata.schema().fields
+        for entry in manifest.fetch_manifest_entry(io=self.tbl.io, 
discard_deleted=discard_deleted):
+            column_sizes = entry.data_file.column_sizes or {}
+            value_counts = entry.data_file.value_counts or {}
+            null_value_counts = entry.data_file.null_value_counts or {}
+            nan_value_counts = entry.data_file.nan_value_counts or {}
+            lower_bounds = entry.data_file.lower_bounds or {}
+            upper_bounds = entry.data_file.upper_bounds or {}
+            readable_metrics = {
+                schema.find_column_name(field.field_id): {
+                    "column_size": column_sizes.get(field.field_id),
+                    "value_count": value_counts.get(field.field_id),
+                    "null_value_count": null_value_counts.get(field.field_id),
+                    "nan_value_count": nan_value_counts.get(field.field_id),
+                    # Makes them readable
+                    "lower_bound": from_bytes(field.field_type, lower_bound)
+                    if (lower_bound := lower_bounds.get(field.field_id))
+                    else None,
+                    "upper_bound": from_bytes(field.field_type, upper_bound)
+                    if (upper_bound := upper_bounds.get(field.field_id))
+                    else None,
                 }
+                for field in self.tbl.metadata.schema().fields
+            }
 
-                partition = entry.data_file.partition
-                partition_record_dict = {
-                    field.name: partition[pos]
-                    for pos, field in 
enumerate(self.tbl.metadata.specs()[manifest.partition_spec_id].fields)
-                }
+            partition = entry.data_file.partition
+            partition_record_dict = {
+                field.name: partition[pos]
+                for pos, field in 
enumerate(self.tbl.metadata.specs()[manifest.partition_spec_id].fields)
+            }
 
-                entries.append(
-                    {
-                        "status": entry.status.value,
-                        "snapshot_id": entry.snapshot_id,
-                        "sequence_number": entry.sequence_number,
-                        "file_sequence_number": entry.file_sequence_number,
-                        "data_file": {
-                            "content": entry.data_file.content,
-                            "file_path": entry.data_file.file_path,
-                            "file_format": entry.data_file.file_format,
-                            "partition": partition_record_dict,
-                            "record_count": entry.data_file.record_count,
-                            "file_size_in_bytes": 
entry.data_file.file_size_in_bytes,
-                            "column_sizes": dict(entry.data_file.column_sizes),
-                            "value_counts": dict(entry.data_file.value_counts),
-                            "null_value_counts": 
dict(entry.data_file.null_value_counts),
-                            "nan_value_counts": 
dict(entry.data_file.nan_value_counts),
-                            "lower_bounds": entry.data_file.lower_bounds,
-                            "upper_bounds": entry.data_file.upper_bounds,
-                            "key_metadata": entry.data_file.key_metadata,
-                            "split_offsets": entry.data_file.split_offsets,
-                            "equality_ids": entry.data_file.equality_ids,
-                            "sort_order_id": entry.data_file.sort_order_id,
-                            "spec_id": entry.data_file.spec_id,
-                        },
-                        "readable_metrics": readable_metrics,
-                    }
-                )
+            entries.append(
+                {
+                    "status": entry.status.value,
+                    "snapshot_id": entry.snapshot_id,
+                    "sequence_number": entry.sequence_number,
+                    "file_sequence_number": entry.file_sequence_number,
+                    "data_file": {
+                        "content": entry.data_file.content,
+                        "file_path": entry.data_file.file_path,
+                        "file_format": entry.data_file.file_format,
+                        "partition": partition_record_dict,
+                        "record_count": entry.data_file.record_count,
+                        "file_size_in_bytes": 
entry.data_file.file_size_in_bytes,
+                        "column_sizes": dict(entry.data_file.column_sizes) or 
None,
+                        "value_counts": dict(entry.data_file.value_counts) if 
entry.data_file.value_counts is not None else None,
+                        "null_value_counts": 
dict(entry.data_file.null_value_counts)
+                        if entry.data_file.null_value_counts is not None
+                        else None,
+                        "nan_value_counts": 
dict(entry.data_file.nan_value_counts)
+                        if entry.data_file.nan_value_counts is not None
+                        else None,

Review Comment:
   nit: can we use the same dict() or None pattern here



##########
pyiceberg/table/inspect.py:
##########
@@ -94,7 +94,7 @@ def snapshots(self) -> "pa.Table":
             schema=snapshots_schema,
         )
 
-    def entries(self, snapshot_id: Optional[int] = None) -> "pa.Table":
+    def _get_entries_schema(self) -> "pa.Schema":

Review Comment:
   same problem here using the table's current metadata (`self.tbl.`)



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: issues-unsubscr...@iceberg.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


---------------------------------------------------------------------
To unsubscribe, e-mail: issues-unsubscr...@iceberg.apache.org
For additional commands, e-mail: issues-h...@iceberg.apache.org

Re: [PR] support all_entries in pyiceberg [iceberg-python]

Reply via email to