kevinjqliu commented on code in PR #1241:
URL: https://github.com/apache/iceberg-python/pull/1241#discussion_r1818300495
##########
pyiceberg/table/inspect.py:
##########
@@ -32,6 +33,41 @@
 from pyiceberg.table import Table
 
+def get_manifests_schema() -> "pa.Schema":

Review Comment:
   nit: can this be moved into the `InspectTable` class instead of the top level?


##########
pyiceberg/table/inspect.py:
##########
@@ -32,6 +33,41 @@
 from pyiceberg.table import Table
 
+def get_manifests_schema() -> "pa.Schema":
+    import pyarrow as pa
+
+    partition_summary_schema = pa.struct([
+        pa.field("contains_null", pa.bool_(), nullable=False),
+        pa.field("contains_nan", pa.bool_(), nullable=True),
+        pa.field("lower_bound", pa.string(), nullable=True),
+        pa.field("upper_bound", pa.string(), nullable=True),
+    ])
+
+    manifest_schema = pa.schema([
+        pa.field("content", pa.int8(), nullable=False),
+        pa.field("path", pa.string(), nullable=False),
+        pa.field("length", pa.int64(), nullable=False),
+        pa.field("partition_spec_id", pa.int32(), nullable=False),
+        pa.field("added_snapshot_id", pa.int64(), nullable=False),
+        pa.field("added_data_files_count", pa.int32(), nullable=False),
+        pa.field("existing_data_files_count", pa.int32(), nullable=False),
+        pa.field("deleted_data_files_count", pa.int32(), nullable=False),
+        pa.field("added_delete_files_count", pa.int32(), nullable=False),
+        pa.field("existing_delete_files_count", pa.int32(), nullable=False),
+        pa.field("deleted_delete_files_count", pa.int32(), nullable=False),
+        pa.field("partition_summaries", pa.list_(partition_summary_schema), nullable=False),
+    ])
+    return manifest_schema
+
+
+def get_all_manifests_schema() -> "pa.Schema":
+    import pyarrow as pa
+
+    all_manifests_schema = get_manifests_schema()
+    all_manifests_schema = all_manifests_schema.append(pa.field("reference_snapshot_id", pa.int64(), nullable=False))

Review Comment:
   interestingly, `reference_snapshot_id` isn't in the documentation (https://iceberg.apache.org/docs/latest/spark-queries/#all-manifests), only in the code: https://github.com/apache/iceberg/blame/2b55fef7cc2a249d864ac26d85a4923313d96a59/core/src/main/java/org/apache/iceberg/AllManifestsTable.java#L53-L54


##########
pyiceberg/table/inspect.py:
##########
@@ -405,13 +419,19 @@ def _partition_summaries_to_rows(
                 "partition_summaries": _partition_summaries_to_rows(specs[manifest.partition_spec_id], manifest.partitions)
                 if manifest.partitions
                 else [],
-            })
+            }
+            if is_all_manifests_table:
+                manifest_row["reference_snapshot_id"] = snapshot.snapshot_id
+            manifests.append(manifest_row)
 
         return pa.Table.from_pylist(
             manifests,
-            schema=manifest_schema,
+            schema=get_all_manifests_schema() if is_all_manifests_table else get_manifests_schema(),
         )
 
+    def manifests(self) -> "pa.Table":

Review Comment:
   wdyt about adding an optional `snapshot_id` here? It would let users look at the manifests of a specific snapshot, with the added benefit that `all_manifests` can then iterate over all snapshot ids.


##########
pyiceberg/table/inspect.py:
##########
@@ -405,13 +419,19 @@ def _partition_summaries_to_rows(
                 "partition_summaries": _partition_summaries_to_rows(specs[manifest.partition_spec_id], manifest.partitions)
                 if manifest.partitions
                 else [],
-            })
+            }
+            if is_all_manifests_table:
+                manifest_row["reference_snapshot_id"] = snapshot.snapshot_id

Review Comment:
   were you able to find this logic in the Java implementation?
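A minimal sketch of how the two suggestions above could fit together: the schema helpers moved onto `InspectTable` (per the first nit) and `manifests` taking an optional `snapshot_id`. The helper names `_generate_manifests_table`, `_get_manifests_schema`, and `_get_all_manifests_schema` are illustrative assumptions, not the PR's API; `snapshot_by_id` and `current_snapshot` are pyiceberg's existing `Table` accessors.

from typing import Optional

import pyarrow as pa


class InspectTable:
    # Assumes self.tbl is the pyiceberg Table, as elsewhere in inspect.py.

    def manifests(self, snapshot_id: Optional[int] = None) -> "pa.Table":
        # Default to the current snapshot; otherwise resolve the requested one.
        snapshot = (
            self.tbl.snapshot_by_id(snapshot_id)
            if snapshot_id is not None
            else self.tbl.current_snapshot()
        )
        if snapshot is None:
            raise ValueError(f"Cannot find snapshot: {snapshot_id}")
        # Hypothetical helper wrapping the row-building logic shown in the diff.
        return self._generate_manifests_table(snapshot)

    def all_manifests(self) -> "pa.Table":
        # Reuse the same row builder for every snapshot in the table metadata,
        # tagging each row with its reference_snapshot_id, then concatenate.
        snapshots = self.tbl.metadata.snapshots
        if not snapshots:
            return pa.Table.from_pylist([], schema=self._get_all_manifests_schema())
        return pa.concat_tables(
            [self._generate_manifests_table(s, is_all_manifests_table=True) for s in snapshots]
        )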
##########
tests/integration/test_inspect_table.py:
##########
@@ -846,3 +846,95 @@ def inspect_files_asserts(df: pa.Table) -> None:
     inspect_files_asserts(files_df)
     inspect_files_asserts(data_files_df)
     inspect_files_asserts(delete_files_df)
+
+
+@pytest.mark.integration
+@pytest.mark.parametrize("format_version", [1, 2])
+def test_inspect_all_manifests(spark: SparkSession, session_catalog: Catalog, format_version: int) -> None:
+    identifier = "default.table_metadata_all_manifests"
+    try:
+        session_catalog.drop_table(identifier=identifier)
+    except NoSuchTableError:
+        pass
+
+    spark.sql(
+        f"""
+        CREATE TABLE {identifier} (
+            id int,
+            data string
+        )
+        PARTITIONED BY (data)
+        TBLPROPERTIES (
+            'write.update.mode'='merge-on-read',
+            'write.delete.mode'='merge-on-read'
+        )
+        """
+    )
+    tbl = session_catalog.load_table(identifier)
+
+    # check all_manifests when there are no snapshots
+    lhs = tbl.inspect.all_manifests().to_pandas()
+    rhs = spark.table(f"{identifier}.all_manifests").toPandas()
+    assert lhs.empty
+    assert rhs.empty
+
+    spark.sql(f"INSERT INTO {identifier} VALUES (1, 'a')")
+
+    spark.sql(f"INSERT INTO {identifier} VALUES (2, 'b')")
+
+    spark.sql(f"UPDATE {identifier} SET data = 'c' WHERE id = 1")
+
+    spark.sql(f"DELETE FROM {identifier} WHERE id = 2")
+
+    spark.sql(f"INSERT OVERWRITE {identifier} VALUES (1, 'a')")
+
+    df = tbl.inspect.all_manifests()
+
+    assert df.column_names == [
+        "content",
+        "path",
+        "length",
+        "partition_spec_id",
+        "added_snapshot_id",
+        "added_data_files_count",
+        "existing_data_files_count",
+        "deleted_data_files_count",
+        "added_delete_files_count",
+        "existing_delete_files_count",
+        "deleted_delete_files_count",
+        "partition_summaries",
+        "reference_snapshot_id",
+    ]
+
+    int_cols = [
+        "content",
+        "length",
+        "partition_spec_id",
+        "added_snapshot_id",
+        "added_data_files_count",
+        "existing_data_files_count",
+        "deleted_data_files_count",
+        "added_delete_files_count",
+        "existing_delete_files_count",
+        "deleted_delete_files_count",
+        "reference_snapshot_id",
+    ]
+
+    for column in int_cols:
+        for value in df[column]:
+            assert isinstance(value.as_py(), int)
+
+    for value in df["path"]:
+        assert isinstance(value.as_py(), str)
+
+    for value in df["partition_summaries"]:
+        assert isinstance(value.as_py(), list)
+        for row in value:
+            assert isinstance(row["contains_null"].as_py(), bool)
+            assert isinstance(row["contains_nan"].as_py(), (bool, type(None)))
+            assert isinstance(row["lower_bound"].as_py(), (str, type(None)))
+            assert isinstance(row["upper_bound"].as_py(), (str, type(None)))
+
+    lhs = spark.table(f"{identifier}.all_manifests").toPandas()
+    rhs = df.to_pandas()
+    for column in df.column_names:
+        for left, right in zip(lhs[column].to_list(), rhs[column].to_list()):
+            assert left == right, f"Difference in column {column}: {left} != {right}"

Review Comment:
   nit: is it possible to use `assert_frame_equal` here?
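For reference, a sketch of what that comparison could look like, reusing the test's `spark`, `identifier`, and `tbl`; the sort key and `check_dtype=False` are assumptions to absorb row-order and integer-dtype differences between the two engines.

from pandas.testing import assert_frame_equal

lhs = spark.table(f"{identifier}.all_manifests").toPandas()
rhs = tbl.inspect.all_manifests().to_pandas()

# Align row order before comparing, since manifest ordering is not
# guaranteed to match between Spark and pyiceberg.
# Note: partition_summaries holds nested structs; if the two engines
# materialize it differently, it may need to be compared separately.
assert_frame_equal(
    lhs.sort_values("path").reset_index(drop=True),
    rhs.sort_values("path").reset_index(drop=True),
    check_dtype=False,
)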