pdames commented on code in PR #140: URL: https://github.com/apache/iceberg-python/pull/140#discussion_r1436629843
########## pyiceberg/catalog/__init__.py: ########## @@ -587,8 +590,34 @@ def _write_metadata(metadata: TableMetadata, io: FileIO, metadata_path: str) -> ToOutputFile.table_metadata(metadata, io.new_output(metadata_path)) @staticmethod - def _get_metadata_location(location: str) -> str: - return f"{location}/metadata/00000-{uuid.uuid4()}.metadata.json" + def _get_metadata_location(location: str, new_version: int = 0) -> str: + if new_version < 0: + raise ValueError(f"Table metadata version: {new_version} cannot be negative") Review Comment: Should we also return a more friendly error in the off chance that new_version is `None`? ```suggestion if not new_version or new_version < 0: raise ValueError(f"Table metadata version: {new_version} must be a positive integer") ``` ########## pyiceberg/catalog/glue.py: ########## @@ -177,6 +198,28 @@ def _create_glue_table(self, database_name: str, table_name: str, table_input: T except self.glue.exceptions.EntityNotFoundException as e: raise NoSuchNamespaceError(f"Database {database_name} does not exist") from e + def _update_glue_table(self, database_name: str, table_name: str, table_input: TableInputTypeDef, version_id: str) -> None: + try: + self.glue.update_table( + DatabaseName=database_name, + TableInput=table_input, + SkipArchive=self.properties.get(GLUE_SKIP_ARCHIVE, GLUE_SKIP_ARCHIVE_DEFAULT), + VersionId=version_id, + ) + except self.glue.exceptions.EntityNotFoundException as e: + raise NoSuchTableError(f"Table does not exist: {database_name}.{table_name}") from e Review Comment: Should we also add the Glue table version here? ```suggestion raise NoSuchTableError(f"Table does not exist: {database_name}.{table_name} (Glue table version {version_id})") from e ``` ########## pyiceberg/catalog/__init__.py: ########## @@ -74,6 +75,8 @@ LOCATION = "location" EXTERNAL_TABLE = "EXTERNAL_TABLE" +TABLE_METADATA_FILE_NAME_REGEX = re.compile(r"""(\d+)-.*\.json""", re.X) Review Comment: To better avoid potential false positives, should we change this to: ```suggestion TABLE_METADATA_FILE_NAME_REGEX = re.compile(r"""(\d+)-(.*)\.metadata\.json""", re.X) ``` Where the 2nd capturing group will also allow us to assert UUID validity later. I'm assuming we don't want to match non-UUID strings or arbitrary JSON files, but let me know if you think this is too restrictive. Also, since we've specified the `re.X` flag anyway, we can make use of it by adding some comments here like: ```suggestion TABLE_METADATA_FILE_NAME_REGEX = re.compile( r""" (\d+) # version number - # separator (.*) # UUID \.metadata\.json # file extension """, re.X ) ``` ########## pyiceberg/catalog/__init__.py: ########## @@ -587,8 +590,34 @@ def _write_metadata(metadata: TableMetadata, io: FileIO, metadata_path: str) -> ToOutputFile.table_metadata(metadata, io.new_output(metadata_path)) @staticmethod - def _get_metadata_location(location: str) -> str: - return f"{location}/metadata/00000-{uuid.uuid4()}.metadata.json" + def _get_metadata_location(location: str, new_version: int = 0) -> str: + if new_version < 0: + raise ValueError(f"Table metadata version: {new_version} cannot be negative") + version_str = f"{new_version:05d}" + return f"{location}/metadata/{version_str}-{uuid.uuid4()}.metadata.json" + + @staticmethod + def _parse_metadata_version(metadata_location: str) -> int: + """Parse the version from the metadata location. + + The version is the first part of the file name, before the first dash. + For example, the version of the metadata file + `s3://bucket/db/tb/metadata/00001-6c97e413-d51b-4538-ac70-12fe2a85cb83.metadata.json` + is 1. + If the path does not comply with the pattern, the version is defaulted to be -1, ensuring + that the next metadata file is treated as having version 0. + + Args: + metadata_location (str): The location of the metadata file. + + Returns: + int: The version of the metadata file. -1 if the file name does not have valid version string + """ + file_name = metadata_location.split("/")[-1] Review Comment: Should we also gracefully return `-1` if `metadata_location` happens to be undefined? ```suggestion file_name = metadata_location.split("/")[-1] if metadata_location else "" ``` ########## pyiceberg/catalog/glue.py: ########## @@ -247,8 +290,52 @@ def _commit_table(self, table_request: CommitTableRequest) -> CommitTableRespons Raises: NoSuchTableError: If a table with the given identifier does not exist. + CommitFailedException: If the commit failed. """ - raise NotImplementedError + identifier_tuple = self.identifier_to_tuple_without_catalog( + tuple(table_request.identifier.namespace.root + [table_request.identifier.name]) + ) + database_name, table_name = self.identifier_to_database_and_table(identifier_tuple) + + current_glue_table = self._get_glue_table(database_name=database_name, table_name=table_name) + glue_table_version_id = current_glue_table.get("VersionId") + if glue_table_version_id is None: Review Comment: Would it be safer/more-robust to check for any type of undefined table version ID (since the Glue API docs don't seem to promise either an omission of `VersionId` or providing an explicit `None` in the response)? ```suggestion if not glue_table_version_id: ``` ########## pyiceberg/catalog/glue.py: ########## @@ -177,6 +198,28 @@ def _create_glue_table(self, database_name: str, table_name: str, table_input: T except self.glue.exceptions.EntityNotFoundException as e: raise NoSuchNamespaceError(f"Database {database_name} does not exist") from e + def _update_glue_table(self, database_name: str, table_name: str, table_input: TableInputTypeDef, version_id: str) -> None: + try: + self.glue.update_table( + DatabaseName=database_name, + TableInput=table_input, + SkipArchive=self.properties.get(GLUE_SKIP_ARCHIVE, GLUE_SKIP_ARCHIVE_DEFAULT), + VersionId=version_id, + ) + except self.glue.exceptions.EntityNotFoundException as e: + raise NoSuchTableError(f"Table does not exist: {database_name}.{table_name}") from e + except self.glue.exceptions.ConcurrentModificationException as e: + raise CommitFailedException( + f"Cannot commit {database_name}.{table_name} because Glue detected concurrent update" Review Comment: Should we also add the Glue table version number that we attempted to update? ```suggestion f"Cannot commit {database_name}.{table_name} because Glue detected concurrent update to table version {version_id}" ``` -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: issues-unsubscr...@iceberg.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org --------------------------------------------------------------------- To unsubscribe, e-mail: issues-unsubscr...@iceberg.apache.org For additional commands, e-mail: issues-h...@iceberg.apache.org