mbutrovich commented on code in PR #1821:
URL: https://github.com/apache/iceberg-rust/pull/1821#discussion_r2510873033
##########
crates/iceberg/src/arrow/record_batch_transformer.rs:
##########
@@ -123,12 +166,55 @@ impl RecordBatchTransformer {
pub(crate) fn build(
snapshot_schema: Arc<IcebergSchema>,
projected_iceberg_field_ids: &[i32],
+ ) -> Self {
+ Self::build_with_partition_data(
+ snapshot_schema,
+ projected_iceberg_field_ids,
+ None,
+ None,
+ None,
+ )
+ }
+
+ /// Build a RecordBatchTransformer with partition spec and data for proper constant identification.
+ ///
+ /// Implements the Iceberg spec's "Column Projection" rules for resolving field IDs "not present" in data files:
+ /// 1. Return the value from partition metadata if an Identity Transform exists
+ /// 2. Use schema.name-mapping.default metadata to map field id to columns without field id
+ /// 3. Return the default value if it has a defined initial-default
+ /// 4. Return null in all other cases
+ ///
+ /// # Why this method exists
+ ///
+ /// 1. **Bucket partitioning**: Distinguish identity transforms (use partition metadata constants)
+ ///    from non-identity transforms like bucket (read from data file) to enable runtime filtering on
+ ///    bucket-partitioned columns.
+ ///
+ /// 2. **Add_files field ID conflicts**: When importing Hive tables, partition columns can have field IDs
+ ///    conflicting with Parquet data columns (e.g., Parquet has field_id=1->"name", but Iceberg expects
+ ///    field_id=1->"id"). Per spec, such fields are "not present" and should use name mapping (rule #2).
+ ///
+ /// This matches Java's ParquetSchemaUtil.applyNameMapping approach but detects conflicts during projection.
+ ///
+ /// # References
+ /// - Spec: https://iceberg.apache.org/spec/#column-projection
+ /// - Java: core/src/main/java/org/apache/iceberg/util/PartitionUtil.java
+ /// - Java: parquet/src/main/java/org/apache/iceberg/parquet/ParquetSchemaUtil.java
+ pub(crate) fn build_with_partition_data(
Review Comment:
Updated.
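
For readers following the thread, here is a minimal sketch of the four-rule resolution order the doc comment above describes. The names (`ColumnSource`, `resolve_projected_field`, and the map/set parameters) are hypothetical stand-ins, not the PR's API; the real logic lives inside `RecordBatchTransformer::build_with_partition_data` and operates on the Iceberg schema, partition spec, and name mapping.

```rust
use std::collections::{HashMap, HashSet};

/// Where a projected column's values come from (illustrative only).
enum ColumnSource<V> {
    PartitionConstant(V), // rule 1: identity-transform constant from partition metadata
    DataFileByName,       // rule 2: resolve the column in the data file via name mapping
    InitialDefault(V),    // rule 3: schema-defined initial-default
    Null,                 // rule 4: fill the column with nulls
}

/// Resolution order for a single projected field id, following the spec rules
/// quoted in the doc comment above.
fn resolve_projected_field<V: Clone>(
    field_id: i32,
    identity_constants: &HashMap<i32, V>, // e.g. the output of constants_map()
    name_mapped_field_ids: &HashSet<i32>, // ids resolvable via schema.name-mapping.default
    initial_default: Option<V>,           // initial-default from the Iceberg schema, if any
) -> ColumnSource<V> {
    if let Some(constant) = identity_constants.get(&field_id) {
        ColumnSource::PartitionConstant(constant.clone())
    } else if name_mapped_field_ids.contains(&field_id) {
        ColumnSource::DataFileByName
    } else if let Some(default) = initial_default {
        ColumnSource::InitialDefault(default)
    } else {
        ColumnSource::Null
    }
}
```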
##########
crates/iceberg/src/arrow/record_batch_transformer.rs:
##########
@@ -29,9 +29,45 @@ use arrow_schema::{
use parquet::arrow::PARQUET_FIELD_ID_META_KEY;
use crate::arrow::schema_to_arrow_schema;
-use crate::spec::{Literal, PrimitiveLiteral, Schema as IcebergSchema};
+use crate::spec::{
+ Literal, NameMapping, PartitionSpec, PrimitiveLiteral, Schema as IcebergSchema, Struct,
+ Transform,
+};
use crate::{Error, ErrorKind, Result};
+/// Build a map of field ID to constant value for identity-partitioned fields.
+///
+/// Implements Iceberg spec "Column Projection" rule #1: use partition metadata constants
+/// only for identity-transformed fields. Non-identity transforms (bucket, truncate, year, etc.)
+/// store derived values in partition metadata, so source columns must be read from data files.
+///
+/// Example: For `bucket(4, id)`, partition metadata has `id_bucket = 2` (bucket number),
+/// but the actual `id` values (100, 200, 300) are only in the data file.
+///
+/// Matches Java's `PartitionUtil.constantsMap()` which filters `if (field.transform().isIdentity())`.
+///
+/// # References
+/// - Spec: https://iceberg.apache.org/spec/#column-projection
+/// - Java: core/src/main/java/org/apache/iceberg/util/PartitionUtil.java:constantsMap()
+fn constants_map(
+ partition_spec: &PartitionSpec,
+ partition_data: &Struct,
+) -> HashMap<i32, PrimitiveLiteral> {
+ let mut constants = HashMap::new();
+
+ for (pos, field) in partition_spec.fields().iter().enumerate() {
+ // Only identity transforms should use constant values from partition metadata
+ if matches!(field.transform, Transform::Identity) {
+ // Get the partition value for this field
+ if let Some(Some(Literal::Primitive(value))) = partition_data.iter().nth(pos) {
Review Comment:
Updated.
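
To make the identity-only filtering concrete, a self-contained sketch with stand-in types (`TransformKind`, `PartitionField`, and `identity_constants` are illustrative, not the crate's `PartitionSpec`/`Transform` API): for a table partitioned by `identity(dept_id)` and `bucket(4, id)`, only `dept_id` becomes a constant, while `id` must still be read from the data file.

```rust
use std::collections::HashMap;

// Stand-ins for crate::spec::{PartitionSpec, PartitionField, Transform}; illustrative only.
#[derive(Clone, Copy, PartialEq)]
enum TransformKind {
    Identity,
    Bucket(u32),
}

struct PartitionField {
    source_field_id: i32,
    transform: TransformKind,
}

/// Same filtering idea as constants_map(): pair partition fields with the
/// partition tuple positionally and keep only identity-transformed fields.
fn identity_constants(
    fields: &[PartitionField],
    partition_values: &[Option<i64>], // simplified; the real code carries Literal/PrimitiveLiteral
) -> HashMap<i32, i64> {
    fields
        .iter()
        .zip(partition_values)
        .filter(|(field, _)| field.transform == TransformKind::Identity)
        .filter_map(|(field, value)| value.map(|v| (field.source_field_id, v)))
        .collect()
}

fn main() {
    // Partition spec: identity(dept_id) at position 0, bucket(4, id) at position 1.
    let fields = [
        PartitionField { source_field_id: 10, transform: TransformKind::Identity },
        PartitionField { source_field_id: 1, transform: TransformKind::Bucket(4) },
    ];
    // Partition tuple for one data file: dept_id = 42, id_bucket = 2.
    let constants = identity_constants(&fields, &[Some(42), Some(2)]);

    assert_eq!(constants.get(&10), Some(&42)); // dept_id is a constant
    assert_eq!(constants.get(&1), None);       // id is read from the data file
}
```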