mbutrovich commented on code in PR #1821:
URL: https://github.com/apache/iceberg-rust/pull/1821#discussion_r2510873033


##########
crates/iceberg/src/arrow/record_batch_transformer.rs:
##########
@@ -123,12 +166,55 @@ impl RecordBatchTransformer {
     pub(crate) fn build(
         snapshot_schema: Arc<IcebergSchema>,
         projected_iceberg_field_ids: &[i32],
+    ) -> Self {
+        Self::build_with_partition_data(
+            snapshot_schema,
+            projected_iceberg_field_ids,
+            None,
+            None,
+            None,
+        )
+    }
+
+    /// Build a RecordBatchTransformer with partition spec and data for proper constant identification.
+    ///
+    /// Implements the Iceberg spec's "Column Projection" rules for resolving field IDs "not present" in data files:
+    /// 1. Return the value from partition metadata if an Identity Transform exists
+    /// 2. Use schema.name-mapping.default metadata to map field id to columns without field id
+    /// 3. Return the default value if it has a defined initial-default
+    /// 4. Return null in all other cases
+    ///
+    /// # Why this method exists
+    ///
+    /// 1. **Bucket partitioning**: Distinguish identity transforms (use partition metadata constants)
+    ///    from non-identity transforms like bucket (read from data file) to enable runtime filtering on
+    ///    bucket-partitioned columns.
+    ///
+    /// 2. **Add_files field ID conflicts**: When importing Hive tables, partition columns can have field IDs
+    ///    conflicting with Parquet data columns (e.g., Parquet has field_id=1->"name", but Iceberg expects
+    ///    field_id=1->"id"). Per spec, such fields are "not present" and should use name mapping (rule #2).
+    ///
+    /// This matches Java's ParquetSchemaUtil.applyNameMapping approach but detects conflicts during projection.
+    ///
+    /// # References
+    /// - Spec: https://iceberg.apache.org/spec/#column-projection
+    /// - Java: core/src/main/java/org/apache/iceberg/util/PartitionUtil.java
+    /// - Java: parquet/src/main/java/org/apache/iceberg/parquet/ParquetSchemaUtil.java
+    pub(crate) fn build_with_partition_data(

Review Comment:
   Updated.



##########
crates/iceberg/src/arrow/record_batch_transformer.rs:
##########
@@ -29,9 +29,45 @@ use arrow_schema::{
 use parquet::arrow::PARQUET_FIELD_ID_META_KEY;
 
 use crate::arrow::schema_to_arrow_schema;
-use crate::spec::{Literal, PrimitiveLiteral, Schema as IcebergSchema};
+use crate::spec::{
+    Literal, NameMapping, PartitionSpec, PrimitiveLiteral, Schema as IcebergSchema, Struct,
+    Transform,
+};
 use crate::{Error, ErrorKind, Result};
 
+/// Build a map of field ID to constant value for identity-partitioned fields.
+///
+/// Implements Iceberg spec "Column Projection" rule #1: use partition metadata constants
+/// only for identity-transformed fields. Non-identity transforms (bucket, truncate, year, etc.)
+/// store derived values in partition metadata, so source columns must be read from data files.
+///
+/// Example: For `bucket(4, id)`, partition metadata has `id_bucket = 2` (bucket number),
+/// but the actual `id` values (100, 200, 300) are only in the data file.
+///
+/// Matches Java's `PartitionUtil.constantsMap()` which filters `if (field.transform().isIdentity())`.
+///
+/// # References
+/// - Spec: https://iceberg.apache.org/spec/#column-projection
+/// - Java: core/src/main/java/org/apache/iceberg/util/PartitionUtil.java:constantsMap()
+fn constants_map(
+    partition_spec: &PartitionSpec,
+    partition_data: &Struct,
+) -> HashMap<i32, PrimitiveLiteral> {
+    let mut constants = HashMap::new();
+
+    for (pos, field) in partition_spec.fields().iter().enumerate() {
+        // Only identity transforms should use constant values from partition metadata
+        if matches!(field.transform, Transform::Identity) {
+            // Get the partition value for this field
+            if let Some(Some(Literal::Primitive(value))) = partition_data.iter().nth(pos) {

Review Comment:
   Updated.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to