geruh commented on code in PR #2410:
URL: https://github.com/apache/iceberg-python/pull/2410#discussion_r2725980121
##########
pyiceberg/partitioning.py:
##########
@@ -249,6 +250,36 @@ def partition_to_path(self, data: Record, schema: Schema)
-> str:
path = "/".join([field_str + "=" + value_str for field_str, value_str
in zip(field_strs, value_strs, strict=True)])
return path
+ def check_compatible(self, schema: Schema, allow_missing_fields: bool =
False) -> None:
+ # if the underlying field is dropped, we cannot check they are
compatible -- continue
+ schema_fields = schema._lazy_id_to_field
+ parents = _index_parents(schema)
+
+ def validate_parents_are_structs(field_id: int) -> None:
+ parent_id = parents.get(field_id)
+ while parent_id:
+ parent_type = schema.find_type(parent_id)
+ if not parent_type.is_struct:
+ raise ValidationError("Invalid partition field parent:
%s", parent_type)
+ parent_id = parents.get(parent_id)
+
+ for field in self.fields:
+ source_field = schema_fields.get(field.source_id)
+ if allow_missing_fields and source_field:
+ continue
+
+ if not isinstance(field.transform, VoidTransform):
+ if source_field:
+ source_type = source_field.field_type
+ if not source_type.is_primitive:
+ raise ValidationError(f"Cannot partition by
non-primitive source field: {source_type}")
+ if not field.transform.can_transform(source_type):
+ raise ValidationError(f"Invalid source type
{source_type} for transform: {field.transform}")
+ # The only valid parent types for a PartitionField are
StructTypes. This must be checked recursively
+ validate_parents_are_structs(field.source_id)
+ else:
+ raise ValidationError(f"Cannot find source column for
partition field: {field}")
Review Comment:
We can simplify the logic here:
- compute the id-to-field lookup once instead of indexing for each field.
- align the `allow_missing_fields` logic with the Java implementation
```suggestion
def check_compatible(self, schema: Schema, allow_missing_fields: bool =
False) -> None:
for field in self.fields:
if isinstance(field.transform, VoidTransform):
continue
source_field = schema._lazy_id_to_field.get(field.source_id)
if source_field is None:
if allow_missing_fields:
continue
raise ValidationError(f"Cannot find source column for
partition field: {field}")
source_type = source_field.field_type
if not source_type.is_primitive:
raise ValidationError(f"Cannot partition by non-primitive
source field: {source_type}")
if not field.transform.can_transform(source_type):
raise ValidationError(f"Invalid source type {source_type}
for transform: {field.transform}")
parent_id = schema._lazy_id_to_parent.get(field.source_id)
while parent_id is not None:
parent_type = schema.find_type(parent_id)
if not parent_type.is_struct:
raise ValidationError(f"Cannot partition by field within
non-struct parent: {parent_type}")
parent_id = schema._lazy_id_to_parent.get(parent_id)
```
##########
pyiceberg/table/sorting.py:
##########
@@ -169,6 +170,17 @@ def __repr__(self) -> str:
fields = f"{', '.join(repr(column) for column in self.fields)}, " if
self.fields else ""
return f"SortOrder({fields}order_id={self.order_id})"
+ def check_compatible(self, schema: Schema) -> None:
+ schema_ids = schema._lazy_id_to_field
+ for field in self.fields:
+ if source_field := schema_ids.get(field.source_id):
+ if not source_field.field_type.is_primitive:
+ raise ValidationError(f"Cannot sort by non-primitive
source field: {source_field}")
+ if not field.transform.can_transform(source_field.field_type):
+ raise ValidationError(f"Invalid source type
{source_field.field_type} for transform: {field.transform}")
+ else:
+ raise ValidationError(f"Cannot find source column for sort
field: {field}")
Review Comment:
Nit: We can flatten the conditional logic, and `_lazy_id_to_parent` is
already computed
```suggestion
def check_compatible(self, schema: Schema) -> None:
for field in self.fields:
source_field = schema._lazy_id_to_field.get(field.source_id)
if source_field is None:
raise ValidationError(f"Cannot find source column for sort
field: {field}")
if not source_field.field_type.is_primitive:
raise ValidationError(f"Cannot sort by non-primitive source
field: {source_field}")
if not field.transform.can_transform(source_field.field_type):
raise ValidationError(f"Invalid source type
{source_field.field_type} for transform: {field.transform}")
```
##########
pyiceberg/partitioning.py:
##########
@@ -32,7 +32,8 @@
model_validator,
)
-from pyiceberg.schema import Schema
+from pyiceberg.exceptions import ValidationError
+from pyiceberg.schema import Schema, _index_parents
Review Comment:
```suggestion
from pyiceberg.schema import Schema
```
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]