HungYangChang commented on issue #1806:
URL:
https://github.com/apache/iceberg-python/issues/1806#issuecomment-2734350396
I added some quick-and-dirty timing logs to `pyiceberg.table`'s `append` method:
```
def append(self, df: pa.Table, snapshot_properties: Dict[str, str] = EMPTY_DICT) -> None:
    """Shorthand API for appending a PyArrow table to a table transaction.

    Instrumented with timing logs (set-up, data-file conversion, manifest
    append, and total) to diagnose slow appends.

    Args:
        df: The Arrow dataframe that will be appended to the table.
        snapshot_properties: Custom properties to be added to the
            snapshot summary.

    Raises:
        ModuleNotFoundError: If PyArrow is not installed.
        ValueError: If ``df`` is not a PyArrow table, or the table has
            partition transforms that cannot be written with pyarrow.
    """
    start_append_time = time.time()
    try:
        import pyarrow as pa
    except ModuleNotFoundError as e:
        raise ModuleNotFoundError("For writes PyArrow needs to be installed") from e

    from pyiceberg.io.pyarrow import _check_pyarrow_schema_compatible, _dataframe_to_data_files

    if not isinstance(df, pa.Table):
        raise ValueError(f"Expected PyArrow table, got: {df}")

    # Reject partition specs containing transforms pyarrow cannot evaluate.
    if unsupported_partitions := [
        field for field in self.table_metadata.spec().fields if not field.transform.supports_pyarrow_transform
    ]:
        raise ValueError(
            f"Not all partition types are supported for writes. Following partitions cannot be written using pyarrow: {unsupported_partitions}."
        )

    downcast_ns_timestamp_to_us = Config().get_bool(DOWNCAST_NS_TIMESTAMP_TO_US_ON_WRITE) or False
    _check_pyarrow_schema_compatible(
        self.table_metadata.schema(),
        provided_schema=df.schema,
        downcast_ns_timestamp_to_us=downcast_ns_timestamp_to_us,
    )

    manifest_merge_enabled = property_as_bool(
        self.table_metadata.properties,
        TableProperties.MANIFEST_MERGE_ENABLED,
        TableProperties.MANIFEST_MERGE_ENABLED_DEFAULT,
    )
    update_snapshot = self.update_snapshot(snapshot_properties=snapshot_properties)
    append_method = update_snapshot.merge_append if manifest_merge_enabled else update_snapshot.fast_append
    logging.info(append_method)
    end_time = time.time()
    logging.info(f"set up {end_time - start_append_time:.3f} seconds")

    with append_method() as append_files:
        # skip writing data files if the dataframe is empty
        if df.shape[0] > 0:
            start_time = time.time()
            data_files = _dataframe_to_data_files(
                table_metadata=self.table_metadata,
                write_uuid=append_files.commit_uuid,
                df=df,
                io=self._table.io,
            )
            end_time = time.time()
            # NOTE(review): this interval logged ~0.000s in the reported run while
            # the loop below logged ~0.8s — _dataframe_to_data_files appears to be
            # lazy, so the real write cost lands in the iteration below. Confirm.
            logging.info(f"_dataframe_to_data_files {end_time - start_time:.3f} seconds")

            start_time = time.time()
            for data_file in data_files:
                append_files.append_data_file(data_file)
            end_time = time.time()
            logging.info(f"append_data_file {end_time - start_time:.3f} seconds")

    end_append_time = time.time()
    # Fix: the original instrumentation reused the "append_data_file" label here
    # even though this measures the whole append (commit included).
    logging.info(f"total append {end_append_time - start_append_time:.3f} seconds")
```
Here is the result I got:
[2025-03-18T18:35:19.587Z] set up **0.018** seconds
[2025-03-18T18:35:19.605Z] _dataframe_to_data_files **0.000** seconds
[2025-03-18T18:35:20.342Z] append_data_file **0.838** seconds
[2025-03-18T18:35:21.799Z] append_data_file **2.333** seconds
[2025-03-18T18:35:22.413Z] Table append operation took **2.950** seconds
[2025-03-18T18:35:22.483Z] Successfully appended data to table:
inboundrequesteventv2 in **3.393** seconds
[2025-03-18T18:35:22.505Z] Wrote to Iceberg in **3.395** seconds
[2025-03-18T18:35:22.516Z] Total processing time: **3.398** seconds
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]