kevinjqliu commented on code in PR #1241:
URL: https://github.com/apache/iceberg-python/pull/1241#discussion_r1818300495
##########
pyiceberg/table/inspect.py:
##########
@@ -32,6 +33,41 @@
 from pyiceberg.table import Table
 
+def get_manifests_schema() -> "pa.Schema":

Review Comment:
   nit: can this be moved into the `InspectTable` class instead of the top level?


##########
pyiceberg/table/inspect.py:
##########
@@ -32,6 +33,41 @@
 from pyiceberg.table import Table
 
+def get_manifests_schema() -> "pa.Schema":
+    import pyarrow as pa
+
+    partition_summary_schema = pa.struct([
+        pa.field("contains_null", pa.bool_(), nullable=False),
+        pa.field("contains_nan", pa.bool_(), nullable=True),
+        pa.field("lower_bound", pa.string(), nullable=True),
+        pa.field("upper_bound", pa.string(), nullable=True),
+    ])
+
+    manifest_schema = pa.schema([
+        pa.field("content", pa.int8(), nullable=False),
+        pa.field("path", pa.string(), nullable=False),
+        pa.field("length", pa.int64(), nullable=False),
+        pa.field("partition_spec_id", pa.int32(), nullable=False),
+        pa.field("added_snapshot_id", pa.int64(), nullable=False),
+        pa.field("added_data_files_count", pa.int32(), nullable=False),
+        pa.field("existing_data_files_count", pa.int32(), nullable=False),
+        pa.field("deleted_data_files_count", pa.int32(), nullable=False),
+        pa.field("added_delete_files_count", pa.int32(), nullable=False),
+        pa.field("existing_delete_files_count", pa.int32(), nullable=False),
+        pa.field("deleted_delete_files_count", pa.int32(), nullable=False),
+        pa.field("partition_summaries", pa.list_(partition_summary_schema), nullable=False),
+    ])
+    return manifest_schema
+
+
+def get_all_manifests_schema() -> "pa.Schema":
+    import pyarrow as pa
+
+    all_manifests_schema = get_manifests_schema()
+    all_manifests_schema = all_manifests_schema.append(pa.field("reference_snapshot_id", pa.int64(), nullable=False))

Review Comment:
   interestingly, `reference_snapshot_id` isn't in the documentation (https://iceberg.apache.org/docs/latest/spark-queries/#all-manifests), only in the code: https://github.com/apache/iceberg/blame/2b55fef7cc2a249d864ac26d85a4923313d96a59/core/src/main/java/org/apache/iceberg/AllManifestsTable.java#L53-L54


##########
pyiceberg/table/inspect.py:
##########
@@ -405,13 +419,19 @@ def _partition_summaries_to_rows(
                 "partition_summaries": _partition_summaries_to_rows(specs[manifest.partition_spec_id], manifest.partitions)
                 if manifest.partitions
                 else [],
-            })
+            }
+            if is_all_manifests_table:
+                manifest_row["reference_snapshot_id"] = snapshot.snapshot_id
+            manifests.append(manifest_row)
 
         return pa.Table.from_pylist(
             manifests,
-            schema=manifest_schema,
+            schema=get_all_manifests_schema() if is_all_manifests_table else get_manifests_schema(),
         )
 
+    def manifests(self) -> "pa.Table":

Review Comment:
   wdyt about adding an optional `snapshot_id` here? It would let users look at the manifests of a specific snapshot, with the added benefit that `all_manifests` can then iterate over all snapshot ids.


##########
pyiceberg/table/inspect.py:
##########
@@ -405,13 +419,19 @@ def _partition_summaries_to_rows(
                 "partition_summaries": _partition_summaries_to_rows(specs[manifest.partition_spec_id], manifest.partitions)
                 if manifest.partitions
                 else [],
-            })
+            }
+            if is_all_manifests_table:
+                manifest_row["reference_snapshot_id"] = snapshot.snapshot_id

Review Comment:
   were you able to find this logic in the Java implementation?
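A minimal sketch of how the two suggestions above could fit together: the schema helpers moved onto `InspectTable` (per the first nit) and `manifests` taking an optional `snapshot_id`. The helper names `_generate_manifests_table`, `_get_manifests_schema`, and `_get_all_manifests_schema` are illustrative assumptions, not the PR's API; `snapshot_by_id` and `current_snapshot` are pyiceberg's existing `Table` accessors.

from typing import Optional

import pyarrow as pa


class InspectTable:
    # Assumes self.tbl is the pyiceberg Table, as elsewhere in inspect.py.

    def manifests(self, snapshot_id: Optional[int] = None) -> "pa.Table":
        # Default to the current snapshot; otherwise resolve the requested one.
        snapshot = (
            self.tbl.snapshot_by_id(snapshot_id)
            if snapshot_id is not None
            else self.tbl.current_snapshot()
        )
        if snapshot is None:
            raise ValueError(f"Cannot find snapshot: {snapshot_id}")
        # Hypothetical helper wrapping the row-building logic shown in the diff.
        return self._generate_manifests_table(snapshot)

    def all_manifests(self) -> "pa.Table":
        # Reuse the same row builder for every snapshot in the table metadata,
        # tagging each row with its reference_snapshot_id, then concatenate.
        snapshots = self.tbl.metadata.snapshots
        if not snapshots:
            return pa.Table.from_pylist([], schema=self._get_all_manifests_schema())
        return pa.concat_tables(
            [self._generate_manifests_table(s, is_all_manifests_table=True) for s in snapshots]
        )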
##########
tests/integration/test_inspect_table.py:
##########
@@ -846,3 +846,95 @@ def inspect_files_asserts(df: pa.Table) -> None:
     inspect_files_asserts(files_df)
     inspect_files_asserts(data_files_df)
     inspect_files_asserts(delete_files_df)
+
+
+@pytest.mark.integration
+@pytest.mark.parametrize("format_version", [1, 2])
+def test_inspect_all_manifests(spark: SparkSession, session_catalog: Catalog, format_version: int) -> None:
+    identifier = "default.table_metadata_all_manifests"
+    try:
+        session_catalog.drop_table(identifier=identifier)
+    except NoSuchTableError:
+        pass
+
+    spark.sql(
+        f"""
+        CREATE TABLE {identifier} (
+            id int,
+            data string
+        )
+        PARTITIONED BY (data)
+        TBLPROPERTIES (
+            'write.update.mode'='merge-on-read',
+            'write.delete.mode'='merge-on-read'
+        )
+        """
+    )
+    tbl = session_catalog.load_table(identifier)
+
+    # check all_manifests when there are no snapshots
+    lhs = tbl.inspect.all_manifests().to_pandas()
+    rhs = spark.table(f"{identifier}.all_manifests").toPandas()
+    assert lhs.empty
+    assert rhs.empty
+
+    spark.sql(f"INSERT INTO {identifier} VALUES (1, 'a')")
+
+    spark.sql(f"INSERT INTO {identifier} VALUES (2, 'b')")
+
+    spark.sql(f"UPDATE {identifier} SET data = 'c' WHERE id = 1")
+
+    spark.sql(f"DELETE FROM {identifier} WHERE id = 2")
+
+    spark.sql(f"INSERT OVERWRITE {identifier} VALUES (1, 'a')")
+
+    df = tbl.inspect.all_manifests()
+
+    assert df.column_names == [
+        "content",
+        "path",
+        "length",
+        "partition_spec_id",
+        "added_snapshot_id",
+        "added_data_files_count",
+        "existing_data_files_count",
+        "deleted_data_files_count",
+        "added_delete_files_count",
+        "existing_delete_files_count",
+        "deleted_delete_files_count",
+        "partition_summaries",
+        "reference_snapshot_id",
+    ]
+
+    int_cols = [
+        "content",
+        "length",
+        "partition_spec_id",
+        "added_snapshot_id",
+        "added_data_files_count",
+        "existing_data_files_count",
+        "deleted_data_files_count",
+        "added_delete_files_count",
+        "existing_delete_files_count",
+        "deleted_delete_files_count",
+        "reference_snapshot_id",
+    ]
+
+    for column in int_cols:
+        for value in df[column]:
+            assert isinstance(value.as_py(), int)
+
+    for value in df["path"]:
+        assert isinstance(value.as_py(), str)
+
+    for value in df["partition_summaries"]:
+        assert isinstance(value.as_py(), list)
+        for row in value:
+            assert isinstance(row["contains_null"].as_py(), bool)
+            assert isinstance(row["contains_nan"].as_py(), (bool, type(None)))
+            assert isinstance(row["lower_bound"].as_py(), (str, type(None)))
+            assert isinstance(row["upper_bound"].as_py(), (str, type(None)))
+
+    lhs = spark.table(f"{identifier}.all_manifests").toPandas()
+    rhs = df.to_pandas()
+    for column in df.column_names:
+        for left, right in zip(lhs[column].to_list(), rhs[column].to_list()):
+            assert left == right, f"Difference in column {column}: {left} != {right}"

Review Comment:
   nit: is it possible to use `assert_frame_equal` here?
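For reference, a sketch of what that comparison could look like, reusing the test's `spark`, `identifier`, and `tbl`; the sort key and `check_dtype=False` are assumptions to absorb row-order and integer-dtype differences between the two engines.

from pandas.testing import assert_frame_equal

lhs = spark.table(f"{identifier}.all_manifests").toPandas()
rhs = tbl.inspect.all_manifests().to_pandas()

# Align row order before comparing, since manifest ordering is not
# guaranteed to match between Spark and pyiceberg.
# Note: partition_summaries holds nested structs; if the two engines
# materialize it differently, it may need to be compared separately.
assert_frame_equal(
    lhs.sort_values("path").reset_index(drop=True),
    rhs.sort_values("path").reset_index(drop=True),
    check_dtype=False,
)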