Fokko commented on code in PR #41:
URL: https://github.com/apache/iceberg-python/pull/41#discussion_r1452170208
##########
pyiceberg/table/__init__.py:
##########
@@ -1910,3 +2006,137 @@ def _generate_snapshot_id() -> int:
     snapshot_id = snapshot_id if snapshot_id >= 0 else snapshot_id * -1
     return snapshot_id
+
+
+@dataclass(frozen=True)
+class WriteTask:
+    write_uuid: uuid.UUID
+    task_id: int
+    df: pa.Table
+    sort_order_id: Optional[int] = None
+
+    # Later to be extended with partition information
+
+    def generate_datafile_filename(self, extension: str) -> str:
+        # Mimics the behavior in the Java API:
+        # https://github.com/apache/iceberg/blob/a582968975dd30ff4917fbbe999f1be903efac02/core/src/main/java/org/apache/iceberg/io/OutputFileFactory.java#L92-L101
+        return f"00000-{self.task_id}-{self.write_uuid}.{extension}"
+
+
+def _new_manifest_path(location: str, num: int, commit_uuid: uuid.UUID) -> str:
+    return f'{location}/metadata/{commit_uuid}-m{num}.avro'
+
+
+def _generate_manifest_list_filename(snapshot_id: int, attempt: int, commit_uuid: uuid.UUID) -> str:
+    # Mimics the behavior in Java:
+    # https://github.com/apache/iceberg/blob/c862b9177af8e2d83122220764a056f3b96fd00c/core/src/main/java/org/apache/iceberg/SnapshotProducer.java#L491
+    return f"snap-{snapshot_id}-{attempt}-{commit_uuid}.avro"
+
+
+def _dataframe_to_data_files(table: Table, df: pa.Table) -> Iterable[DataFile]:
+    from pyiceberg.io.pyarrow import write_file
+
+    write_uuid = uuid.uuid4()
+    counter = itertools.count(0)
+
+    # This is an iter, so we don't have to materialize everything every time
+    # This will be more relevant when we start doing partitioned writes
+    yield from write_file(table, iter([WriteTask(write_uuid, next(counter), df)]))
+
+
+class _MergeAppend:
+    _operation: Operation
+    _table: Table
+    _snapshot_id: int
+    _parent_snapshot_id: Optional[int]
+    _added_datafiles: List[DataFile]
+    _existing_datafiles: List[DataFile]
+    _commit_uuid: uuid.UUID
+
+    def __init__(self, operation: Operation, table: Table, snapshot_id: int) -> None:
+        self._operation = operation
+        self._table = table
+        self._snapshot_id = snapshot_id
+        # Since we only support the main branch for now
+        self._parent_snapshot_id = snapshot.snapshot_id if (snapshot := self._table.current_snapshot()) else None
+        self._added_datafiles = []
+        self._existing_datafiles = []
+        self._commit_uuid = uuid.uuid4()
+
+    def append_datafile(self, data_file: DataFile, added: bool = True) -> _MergeAppend:

Review Comment:
   I got the impression that we wanted to start with normal appends, because you suggested removing the `append_manifest` method earlier: https://github.com/apache/iceberg-python/pull/41#discussion_r1429244651. Appending existing manifests also introduces complexity in the snapshot generator, because we can then no longer trust the partition metrics: https://github.com/apache/iceberg/blob/5a1b0d1802e17f92df7c7e98e1e4e7c1486bd37c/core/src/main/java/org/apache/iceberg/SnapshotSummary.java#L141C1-L145
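   To make the "cannot trust the partition metrics" point concrete, here is a minimal sketch, not PyIceberg's actual API (the names `SummaryBuilder`, `add_data_file`, and `add_manifest` are hypothetical), of how the two append paths differ when building a snapshot summary: an individual data file carries its partition and record counts, while an appended manifest only exposes aggregates.

```python
from dataclasses import dataclass, field
from typing import Dict, Tuple


@dataclass
class SummaryBuilder:
    """Hypothetical snapshot-summary accumulator, for illustration only."""

    trust_partition_metrics: bool = True
    added_files: int = 0
    added_records: int = 0
    partition_metrics: Dict[Tuple, int] = field(default_factory=dict)

    def add_data_file(self, partition: Tuple, record_count: int) -> None:
        # Appending an individual data file: its partition and counts are
        # known, so per-partition metrics stay accurate.
        self.added_files += 1
        self.added_records += record_count
        self.partition_metrics[partition] = self.partition_metrics.get(partition, 0) + record_count

    def add_manifest(self, added_files: int, added_records: int) -> None:
        # Appending an existing manifest wholesale: only aggregate counts are
        # available, so per-partition metrics can no longer be trusted
        # (the situation the linked Java SnapshotSummary code guards against).
        self.trust_partition_metrics = False
        self.added_files += added_files
        self.added_records += added_records
```

   Starting with plain data-file appends avoids this case entirely, which is why dropping `append_manifest` for now keeps the snapshot generator simpler.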