HonahX commented on code in PR #288: URL: https://github.com/apache/iceberg-python/pull/288#discussion_r1460723261
########## pyiceberg/catalog/glue.py: ########## @@ -84,19 +110,105 @@ def _construct_parameters( return new_parameters +def _type_to_glue_type_string(input_type: IcebergType) -> str: + if isinstance(input_type, BooleanType): + return "boolean" + if isinstance(input_type, IntegerType): + return "int" + if isinstance(input_type, LongType): + return "bigint" + if isinstance(input_type, FloatType): + return "float" + if isinstance(input_type, DoubleType): + return "double" + if isinstance(input_type, DateType): + return "date" + if isinstance( + input_type, + ( + TimeType, + StringType, + UUIDType, + ), + ): + return "string" + if isinstance(input_type, TimestampType): + return "timestamp" + if isinstance( + input_type, + ( + FixedType, + BinaryType, + ), + ): + return "binary" + if isinstance(input_type, DecimalType): + return f"decimal({input_type.precision},{input_type.scale})" + if isinstance(input_type, StructType): + name_to_type = ",".join(f"{f.name}:{_type_to_glue_type_string(f.field_type)}" for f in input_type.fields) + return f"struct<{name_to_type}>" + if isinstance(input_type, ListType): + return f"array<{_type_to_glue_type_string(input_type.element_type)}>" + if isinstance(input_type, MapType): + return f"map<{_type_to_glue_type_string(input_type.key_type)},{_type_to_glue_type_string(input_type.value_type)}>" + + raise ValueError(f"Unknown Type {input_type}") + + +def _to_columns(metadata: TableMetadataCommonFields) -> List[ColumnTypeDef]: + results: Dict[str, ColumnTypeDef] = {} + + def _append_to_results(field: NestedField, is_current: bool) -> None: + if field.name in results: + return + + results[field.name] = cast( + ColumnTypeDef, + { + "Name": field.name, + "Type": _type_to_glue_type_string(field.field_type), + "Parameters": { + ICEBERG_FIELD_ID: str(field.field_id), + ICEBERG_FIELD_OPTIONAL: str(field.optional).lower(), + ICEBERG_FIELD_CURRENT: str(is_current).lower(), + }, + }, + ) + if field.doc: + results[field.name]["Comment"] = field.doc + + 
if current_schema := metadata.schema_by_id(metadata.current_schema_id): + for field in current_schema.columns: + _append_to_results(field, True) + + for schema in metadata.schemas: + if schema.schema_id == metadata.current_schema_id: + continue + for field in schema.columns: + _append_to_results(field, False) + + return list(results.values()) + + def _construct_table_input( table_name: str, metadata_location: str, properties: Properties, + metadata: TableMetadataCommonFields, Review Comment: ```suggestion metadata: TableMetadata, ``` ########## pyiceberg/catalog/glue.py: ########## @@ -84,19 +110,105 @@ def _construct_parameters( return new_parameters +def _type_to_glue_type_string(input_type: IcebergType) -> str: + if isinstance(input_type, BooleanType): + return "boolean" + if isinstance(input_type, IntegerType): + return "int" + if isinstance(input_type, LongType): + return "bigint" + if isinstance(input_type, FloatType): + return "float" + if isinstance(input_type, DoubleType): + return "double" + if isinstance(input_type, DateType): + return "date" + if isinstance( + input_type, + ( + TimeType, + StringType, + UUIDType, + ), + ): + return "string" + if isinstance(input_type, TimestampType): + return "timestamp" + if isinstance( + input_type, + ( + FixedType, + BinaryType, + ), + ): + return "binary" + if isinstance(input_type, DecimalType): + return f"decimal({input_type.precision},{input_type.scale})" + if isinstance(input_type, StructType): + name_to_type = ",".join(f"{f.name}:{_type_to_glue_type_string(f.field_type)}" for f in input_type.fields) + return f"struct<{name_to_type}>" + if isinstance(input_type, ListType): + return f"array<{_type_to_glue_type_string(input_type.element_type)}>" + if isinstance(input_type, MapType): + return f"map<{_type_to_glue_type_string(input_type.key_type)},{_type_to_glue_type_string(input_type.value_type)}>" + + raise ValueError(f"Unknown Type {input_type}") + + +def _to_columns(metadata: TableMetadataCommonFields) -> 
List[ColumnTypeDef]: Review Comment: ```suggestion def _to_columns(metadata: TableMetadata) -> List[ColumnTypeDef]: ``` ########## pyiceberg/catalog/glue.py: ########## @@ -84,19 +110,105 @@ def _construct_parameters( return new_parameters +def _type_to_glue_type_string(input_type: IcebergType) -> str: + if isinstance(input_type, BooleanType): + return "boolean" + if isinstance(input_type, IntegerType): + return "int" + if isinstance(input_type, LongType): + return "bigint" + if isinstance(input_type, FloatType): + return "float" + if isinstance(input_type, DoubleType): + return "double" + if isinstance(input_type, DateType): + return "date" + if isinstance( + input_type, + ( + TimeType, + StringType, + UUIDType, + ), + ): + return "string" + if isinstance(input_type, TimestampType): + return "timestamp" + if isinstance( + input_type, + ( + FixedType, + BinaryType, + ), + ): + return "binary" + if isinstance(input_type, DecimalType): + return f"decimal({input_type.precision},{input_type.scale})" + if isinstance(input_type, StructType): + name_to_type = ",".join(f"{f.name}:{_type_to_glue_type_string(f.field_type)}" for f in input_type.fields) + return f"struct<{name_to_type}>" + if isinstance(input_type, ListType): + return f"array<{_type_to_glue_type_string(input_type.element_type)}>" + if isinstance(input_type, MapType): + return f"map<{_type_to_glue_type_string(input_type.key_type)},{_type_to_glue_type_string(input_type.value_type)}>" + + raise ValueError(f"Unknown Type {input_type}") + + +def _to_columns(metadata: TableMetadataCommonFields) -> List[ColumnTypeDef]: + results: Dict[str, ColumnTypeDef] = {} + + def _append_to_results(field: NestedField, is_current: bool) -> None: + if field.name in results: + return + + results[field.name] = cast( + ColumnTypeDef, + { + "Name": field.name, + "Type": _type_to_glue_type_string(field.field_type), + "Parameters": { + ICEBERG_FIELD_ID: str(field.field_id), + ICEBERG_FIELD_OPTIONAL: str(field.optional).lower(), + 
ICEBERG_FIELD_CURRENT: str(is_current).lower(), + }, + }, + ) + if field.doc: + results[field.name]["Comment"] = field.doc + + if current_schema := metadata.schema_by_id(metadata.current_schema_id): + for field in current_schema.columns: + _append_to_results(field, True) + + for schema in metadata.schemas: + if schema.schema_id == metadata.current_schema_id: + continue + for field in schema.columns: + _append_to_results(field, False) + + return list(results.values()) + + def _construct_table_input( table_name: str, metadata_location: str, properties: Properties, + metadata: TableMetadataCommonFields, + location: Optional[str] = None, glue_table: Optional[TableTypeDef] = None, prev_metadata_location: Optional[str] = None, ) -> TableInputTypeDef: table_input: TableInputTypeDef = { "Name": table_name, "TableType": EXTERNAL_TABLE, "Parameters": _construct_parameters(metadata_location, glue_table, prev_metadata_location), + "StorageDescriptor": {"Columns": _to_columns(metadata)}, } + if location: + table_input["StorageDescriptor"]["Location"] = location Review Comment: The `location` is a required field in metadata, so we can fetch it from the metadata directly: ```python table_input["StorageDescriptor"]["Location"] = metadata.location ``` and we can safely delete the "location" parameter ########## pyiceberg/catalog/glue.py: ########## @@ -84,19 +110,105 @@ def _construct_parameters( return new_parameters +def _type_to_glue_type_string(input_type: IcebergType) -> str: Review Comment: I think we might utilize the existing schema visitor for this task: https://github.com/apache/iceberg-python/blob/4616d036440bf4cb3733e8d091220587cf290b75/pyiceberg/schema.py#L345-L348 https://github.com/apache/iceberg-python/blob/4616d036440bf4cb3733e8d091220587cf290b75/pyiceberg/schema.py#L784-L797 We could implement a `_TypeToGlueTypeString` implementing the `SchemaVisitor[str]` like this ```python class _TypeToGlueTypeString(SchemaVisitor[str]): def schema(self, schema: Schema, 
struct_result: str) -> str: return struct_result def struct(self, struct: StructType, field_results: List[str]) -> str: name_to_type = ",".join(field_results) return f"struct<{name_to_type}>" def field(self, field: NestedField, field_result: str) -> str: return f"{field.name}:{field_result}" def list(self, list_type: ListType, element_result: str) -> str: return f"array<{element_result}>" def map(self, map_type: MapType, key_result: str, value_result: str) -> str: return f"map<{key_result},{value_result}>" def primitive(self, primitive: IcebergType) -> str: if isinstance(primitive, BooleanType): return "boolean" elif isinstance(primitive, IntegerType): return "int" elif isinstance(primitive, LongType): return "bigint" ...(more types) raise ValueError(f"Unknown Type {primitive}") ``` This would also let us benefit from the `@singledispatch` decorator, which makes the visitor faster in practice. Does this idea sound reasonable to you? ########## tests/catalog/test_glue.py: ########## @@ -554,6 +580,19 @@ def test_commit_table_update_schema( assert new_schema == update._apply() assert new_schema.find_field("b").field_type == IntegerType() + # Ensure schema is also pushed to Glue + table_info = _glue.get_table( + DatabaseName=database_name, + Name=table_name, + ) + columns = table_info["Table"]["StorageDescriptor"]["Columns"] + assert len(columns) == len(table_schema_nested.fields) + 1 + assert columns[-1] == { + "Name": "b", + "Type": "int", + "Parameters": {"iceberg.field.id": "18", "iceberg.field.optional": "true", "iceberg.field.current": "true"}, + } + Review Comment: Shall we also check the `StorageDescriptor.Location` here? 
```suggestion assert table_info["Table"]["StorageDescriptor"]["Location"] == f"s3://{BUCKET_NAME}/{database_name}.db/{table_name}" ``` ########## pyiceberg/catalog/glue.py: ########## @@ -84,19 +110,105 @@ def _construct_parameters( return new_parameters +def _type_to_glue_type_string(input_type: IcebergType) -> str: + if isinstance(input_type, BooleanType): + return "boolean" + if isinstance(input_type, IntegerType): + return "int" + if isinstance(input_type, LongType): + return "bigint" + if isinstance(input_type, FloatType): + return "float" + if isinstance(input_type, DoubleType): + return "double" + if isinstance(input_type, DateType): + return "date" + if isinstance( + input_type, + ( + TimeType, + StringType, + UUIDType, + ), + ): + return "string" + if isinstance(input_type, TimestampType): + return "timestamp" + if isinstance( + input_type, + ( + FixedType, + BinaryType, + ), + ): + return "binary" + if isinstance(input_type, DecimalType): + return f"decimal({input_type.precision},{input_type.scale})" + if isinstance(input_type, StructType): + name_to_type = ",".join(f"{f.name}:{_type_to_glue_type_string(f.field_type)}" for f in input_type.fields) + return f"struct<{name_to_type}>" + if isinstance(input_type, ListType): + return f"array<{_type_to_glue_type_string(input_type.element_type)}>" + if isinstance(input_type, MapType): + return f"map<{_type_to_glue_type_string(input_type.key_type)},{_type_to_glue_type_string(input_type.value_type)}>" + + raise ValueError(f"Unknown Type {input_type}") + + +def _to_columns(metadata: TableMetadataCommonFields) -> List[ColumnTypeDef]: + results: Dict[str, ColumnTypeDef] = {} + + def _append_to_results(field: NestedField, is_current: bool) -> None: + if field.name in results: + return + + results[field.name] = cast( + ColumnTypeDef, + { + "Name": field.name, + "Type": _type_to_glue_type_string(field.field_type), Review Comment: If we decide to use the existing visitor as mentioned above, this will become ```python 
"Type": visit(field.field_type, _TypeToGlueTypeString()) ``` ########## pyiceberg/catalog/glue.py: ########## @@ -84,19 +110,105 @@ def _construct_parameters( return new_parameters +def _type_to_glue_type_string(input_type: IcebergType) -> str: + if isinstance(input_type, BooleanType): + return "boolean" + if isinstance(input_type, IntegerType): + return "int" + if isinstance(input_type, LongType): + return "bigint" + if isinstance(input_type, FloatType): + return "float" + if isinstance(input_type, DoubleType): + return "double" + if isinstance(input_type, DateType): + return "date" + if isinstance( + input_type, + ( + TimeType, + StringType, + UUIDType, + ), + ): + return "string" + if isinstance(input_type, TimestampType): + return "timestamp" + if isinstance( + input_type, + ( + FixedType, + BinaryType, + ), + ): + return "binary" + if isinstance(input_type, DecimalType): + return f"decimal({input_type.precision},{input_type.scale})" + if isinstance(input_type, StructType): + name_to_type = ",".join(f"{f.name}:{_type_to_glue_type_string(f.field_type)}" for f in input_type.fields) + return f"struct<{name_to_type}>" + if isinstance(input_type, ListType): + return f"array<{_type_to_glue_type_string(input_type.element_type)}>" + if isinstance(input_type, MapType): + return f"map<{_type_to_glue_type_string(input_type.key_type)},{_type_to_glue_type_string(input_type.value_type)}>" + + raise ValueError(f"Unknown Type {input_type}") + + +def _to_columns(metadata: TableMetadataCommonFields) -> List[ColumnTypeDef]: + results: Dict[str, ColumnTypeDef] = {} + + def _append_to_results(field: NestedField, is_current: bool) -> None: + if field.name in results: + return + + results[field.name] = cast( + ColumnTypeDef, + { + "Name": field.name, + "Type": _type_to_glue_type_string(field.field_type), + "Parameters": { + ICEBERG_FIELD_ID: str(field.field_id), + ICEBERG_FIELD_OPTIONAL: str(field.optional).lower(), + ICEBERG_FIELD_CURRENT: str(is_current).lower(), + }, + }, + ) + if 
field.doc: + results[field.name]["Comment"] = field.doc + + if current_schema := metadata.schema_by_id(metadata.current_schema_id): + for field in current_schema.columns: + _append_to_results(field, True) + + for schema in metadata.schemas: + if schema.schema_id == metadata.current_schema_id: + continue + for field in schema.columns: + _append_to_results(field, False) Review Comment: I think these can be combined to: ```python for schema in metadata.schemas: for field in schema.columns: _append_to_results(field=field, is_current=schema.schema_id == metadata.current_schema_id) ``` -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: issues-unsubscr...@iceberg.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org --------------------------------------------------------------------- To unsubscribe, e-mail: issues-unsubscr...@iceberg.apache.org For additional commands, e-mail: issues-h...@iceberg.apache.org