gbrgr commented on code in PR #1824:
URL: https://github.com/apache/iceberg-rust/pull/1824#discussion_r2580834853
##########
crates/iceberg/src/arrow/record_batch_transformer.rs:
##########
@@ -539,83 +593,164 @@ impl RecordBatchTransformer {
prim_lit: &Option<PrimitiveLiteral>,
num_rows: usize,
) -> Result<ArrayRef> {
- Ok(match (target_type, prim_lit) {
- (DataType::Boolean, Some(PrimitiveLiteral::Boolean(value))) => {
- Arc::new(BooleanArray::from(vec![*value; num_rows]))
- }
- (DataType::Boolean, None) => {
- let vals: Vec<Option<bool>> = vec![None; num_rows];
- Arc::new(BooleanArray::from(vals))
- }
- (DataType::Int32, Some(PrimitiveLiteral::Int(value))) => {
- Arc::new(Int32Array::from(vec![*value; num_rows]))
- }
- (DataType::Int32, None) => {
- let vals: Vec<Option<i32>> = vec![None; num_rows];
- Arc::new(Int32Array::from(vals))
- }
- (DataType::Date32, Some(PrimitiveLiteral::Int(value))) => {
- Arc::new(Date32Array::from(vec![*value; num_rows]))
+ // All added columns use Run-End Encoding for memory efficiency
+ let DataType::RunEndEncoded(_, values_field) = target_type else {
+ return Err(Error::new(
+ ErrorKind::Unexpected,
+ format!(
+ "Expected RunEndEncoded type for added column, got: {}",
+ target_type
+ ),
+ ));
+ };
+
+ // Helper to create a Run-End Encoded array
+ let create_ree_array = |values_array: ArrayRef| -> Result<ArrayRef> {
+ let run_ends = if num_rows == 0 {
+ Int32Array::from(Vec::<i32>::new())
+ } else {
+ Int32Array::from(vec![num_rows as i32])
+ };
+ Ok(Arc::new(
+ RunArray::try_new(&run_ends, &values_array).map_err(|e| {
+ Error::new(
+ ErrorKind::Unexpected,
+ "Failed to create RunArray for constant value",
+ )
+ .with_source(e)
+ })?,
+ ))
+ };
+
+ // Create the values array based on the literal value
+ let values_array: ArrayRef = match (values_field.data_type(),
prim_lit) {
+ (DataType::Boolean, Some(PrimitiveLiteral::Boolean(v))) => {
+ Arc::new(BooleanArray::from(vec![*v]))
}
- (DataType::Date32, None) => {
- let vals: Vec<Option<i32>> = vec![None; num_rows];
- Arc::new(Date32Array::from(vals))
+ (DataType::Boolean, None) =>
Arc::new(BooleanArray::from(vec![Option::<bool>::None])),
+ (DataType::Int32, Some(PrimitiveLiteral::Int(v))) => {
+ Arc::new(Int32Array::from(vec![*v]))
}
- (DataType::Int64, Some(PrimitiveLiteral::Long(value))) => {
- Arc::new(Int64Array::from(vec![*value; num_rows]))
+ (DataType::Int32, None) =>
Arc::new(Int32Array::from(vec![Option::<i32>::None])),
+ (DataType::Date32, Some(PrimitiveLiteral::Int(v))) => {
+ Arc::new(Date32Array::from(vec![*v]))
}
- (DataType::Int64, None) => {
- let vals: Vec<Option<i64>> = vec![None; num_rows];
- Arc::new(Int64Array::from(vals))
+ (DataType::Date32, None) =>
Arc::new(Date32Array::from(vec![Option::<i32>::None])),
+ (DataType::Int64, Some(PrimitiveLiteral::Long(v))) => {
+ Arc::new(Int64Array::from(vec![*v]))
}
- (DataType::Float32, Some(PrimitiveLiteral::Float(value))) => {
- Arc::new(Float32Array::from(vec![value.0; num_rows]))
+ (DataType::Int64, None) =>
Arc::new(Int64Array::from(vec![Option::<i64>::None])),
+ (DataType::Float32, Some(PrimitiveLiteral::Float(v))) => {
+ Arc::new(Float32Array::from(vec![v.0]))
}
- (DataType::Float32, None) => {
- let vals: Vec<Option<f32>> = vec![None; num_rows];
- Arc::new(Float32Array::from(vals))
+ (DataType::Float32, None) =>
Arc::new(Float32Array::from(vec![Option::<f32>::None])),
+ (DataType::Float64, Some(PrimitiveLiteral::Double(v))) => {
+ Arc::new(Float64Array::from(vec![v.0]))
}
- (DataType::Float64, Some(PrimitiveLiteral::Double(value))) => {
- Arc::new(Float64Array::from(vec![value.0; num_rows]))
+ (DataType::Float64, None) =>
Arc::new(Float64Array::from(vec![Option::<f64>::None])),
+ (DataType::Utf8, Some(PrimitiveLiteral::String(v))) => {
+ Arc::new(StringArray::from(vec![v.as_str()]))
}
- (DataType::Float64, None) => {
- let vals: Vec<Option<f64>> = vec![None; num_rows];
- Arc::new(Float64Array::from(vals))
+ (DataType::Utf8, None) =>
Arc::new(StringArray::from(vec![Option::<&str>::None])),
+ (DataType::Binary, Some(PrimitiveLiteral::Binary(v))) => {
+ Arc::new(BinaryArray::from_vec(vec![v.as_slice()]))
}
- (DataType::Utf8, Some(PrimitiveLiteral::String(value))) => {
- Arc::new(StringArray::from(vec![value.clone(); num_rows]))
+ (DataType::Binary, None) => {
+
Arc::new(BinaryArray::from_opt_vec(vec![Option::<&[u8]>::None]))
}
- (DataType::Utf8, None) => {
- let vals: Vec<Option<String>> = vec![None; num_rows];
- Arc::new(StringArray::from(vals))
+ (DataType::Decimal128(_, _), Some(PrimitiveLiteral::Int128(v))) =>
{
+ Arc::new(arrow_array::Decimal128Array::from(vec![{ *v }]))
}
- (DataType::Binary, Some(PrimitiveLiteral::Binary(value))) => {
- Arc::new(BinaryArray::from_vec(vec![value; num_rows]))
+ (DataType::Decimal128(_, _), Some(PrimitiveLiteral::UInt128(v)))
=> {
+ Arc::new(arrow_array::Decimal128Array::from(vec![*v as i128]))
}
- (DataType::Binary, None) => {
- let vals: Vec<Option<&[u8]>> = vec![None; num_rows];
- Arc::new(BinaryArray::from_opt_vec(vals))
+ (DataType::Decimal128(_, _), None) => {
+ Arc::new(arrow_array::Decimal128Array::from(vec![
+ Option::<i128>::None,
+ ]))
}
(DataType::Struct(fields), None) => {
- // Create a StructArray filled with nulls. Per Iceberg spec,
optional struct fields
- // default to null when added to the schema. We defer non-null
default struct values
- // and leave them as not implemented yet.
+ // Create a single-element StructArray with nulls
let null_arrays: Vec<ArrayRef> = fields
.iter()
- .map(|field| Self::create_column(field.data_type(), &None,
num_rows))
- .collect::<Result<Vec<_>>>()?;
-
- Arc::new(StructArray::new(
+ .map(|f| {
+ // Recursively create null arrays for struct fields
+ // For primitive fields in structs, use simple null
arrays (not REE within struct)
+ match f.data_type() {
+ DataType::Boolean => {
+
Arc::new(BooleanArray::from(vec![Option::<bool>::None])) as ArrayRef
+ }
+ DataType::Int32 | DataType::Date32 => {
+
Arc::new(Int32Array::from(vec![Option::<i32>::None]))
+ }
+ DataType::Int64 => {
+
Arc::new(Int64Array::from(vec![Option::<i64>::None]))
+ }
+ DataType::Float32 => {
+
Arc::new(Float32Array::from(vec![Option::<f32>::None]))
+ }
+ DataType::Float64 => {
+
Arc::new(Float64Array::from(vec![Option::<f64>::None]))
+ }
+ DataType::Utf8 => {
+
Arc::new(StringArray::from(vec![Option::<&str>::None]))
+ }
+ DataType::Binary => {
+
Arc::new(BinaryArray::from_opt_vec(vec![Option::<&[u8]>::None]))
+ }
+ _ => panic!("Unsupported struct field type: {:?}",
f.data_type()),
+ }
+ })
+ .collect();
+ Arc::new(arrow_array::StructArray::new(
fields.clone(),
null_arrays,
- Some(NullBuffer::new_null(num_rows)),
+ Some(arrow_buffer::NullBuffer::new_null(1)),
))
}
- (DataType::Null, _) => Arc::new(NullArray::new(num_rows)),
- (dt, _) => {
+ _ => {
+ return Err(Error::new(
+ ErrorKind::Unexpected,
+ format!(
+ "Unsupported constant type combination: {:?} with
{:?}",
+ values_field.data_type(),
+ prim_lit
+ ),
+ ));
+ }
+ };
+
+ // Wrap in Run-End Encoding
+ create_ree_array(values_array)
+ }
+
+ /// Converts a PrimitiveLiteral to its corresponding Arrow DataType.
+ /// This is used for constant fields to determine the Arrow type.
+ /// For constant values, we use Run-End Encoding for all types to save
memory.
+ fn primitive_literal_to_arrow_type(literal: &PrimitiveLiteral) ->
Result<DataType> {
+ // Helper to create REE type with the given values type
+ // Note: values field is nullable as Arrow expects this when building
the
+ // final Arrow schema with `RunArray::try_new`.
+ let make_ree = |values_type: DataType| -> DataType {
Review Comment:
I moved this helper into `schema.rs` as a single method.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]