liurenjie1024 commented on code in PR #261: URL: https://github.com/apache/iceberg-rust/pull/261#discussion_r1531709362
########## crates/iceberg/src/spec/schema.rs: ########## @@ -1338,4 +1533,430 @@ table { ); } } + #[test] + fn test_schema_prune_columns_string() { + let expected_schema = Type::Struct(StructType::new(vec ########## crates/iceberg/src/spec/schema.rs: ########## @@ -642,6 +644,199 @@ impl SchemaVisitor for IndexByName { } } +struct PruneColumn { + selected: HashSet<i32>, + select_full_types: bool, +} + +impl PruneColumn { + fn new(selected: HashSet<i32>, select_full_types: bool) -> Self { + Self { + selected, + select_full_types, + } + } + + fn project_selected_struct(projected_field: Option<Type>) -> Result<StructType> { + match projected_field { + // If the field is a StructType, return it as such + Some(field) if field.is_struct() => Ok(field.as_struct_type().unwrap_or_default()), + // If projected_field is None or not a StructType, return an empty StructType + _ => Ok(StructType::default()), + } + } + fn project_list(list: &ListType, result: Option<Type>) -> Result<ListType> { + if let Some(value_type) = result { + if *list.element_field.field_type == value_type { + return Ok(list.clone()); + } + Ok(ListType { + element_field: Arc::new(NestedField { + id: list.element_field.id, + name: list.element_field.name.clone(), + required: list.element_field.required, + field_type: Box::new(value_type), + doc: list.element_field.doc.clone(), + initial_default: list.element_field.initial_default.clone(), + write_default: list.element_field.write_default.clone(), + }), + }) + } else { + Err(Error::new(ErrorKind::DataInvalid, "Type value is none")) + } + } + fn project_map(map: &MapType, value: Option<Type>) -> Result<MapType> { + if let Some(value_type) = value { + if *map.value_field.field_type == value_type { + return Ok(map.clone()); + } + Ok(MapType { + key_field: map.key_field.clone(), + value_field: Arc::new(NestedField { + id: map.value_field.id, + name: map.value_field.name.clone(), + required: map.value_field.required, + field_type: Box::new(value_type), + doc: map.value_field.doc.clone(), + initial_default: map.value_field.initial_default.clone(), + write_default: map.value_field.write_default.clone(), + }), + }) + } else { + Err(Error::new(ErrorKind::DataInvalid, "Type value is none")) + } + } +} + +impl SchemaVisitor for PruneColumn { + type T = Option<Type>; + + fn schema(&mut self, _schema: &Schema, value: Self::T) -> Result<Self::T> { + Ok(Some(value.unwrap())) + } + + fn field(&mut self, field: &NestedFieldRef, value: Self::T) -> Result<Self::T> { + if self.selected.contains(&field.id) { + if self.select_full_types { + Ok(Some(*field.field_type.clone())) + } else if field.field_type.is_struct() { + return Ok(Some(Type::Struct(PruneColumn::project_selected_struct( + value, + )?))); + } else if !field.field_type.is_nested() { + return Ok(Some(*field.field_type.clone())); + } else { + return Err(Error::new( + ErrorKind::DataInvalid, + "Nested field types not allowed in this context".to_string(), + )); Review Comment: ```suggestion ).with_context("field_id", field_id) .with_context("field_type", field_type) ); ``` ########## crates/iceberg/src/spec/schema.rs: ########## @@ -642,6 +644,199 @@ impl SchemaVisitor for IndexByName { } } +struct PruneColumn { + selected: HashSet<i32>, + select_full_types: bool, +} + +impl PruneColumn { + fn new(selected: HashSet<i32>, select_full_types: bool) -> Self { Review Comment: For the dead code warning, will we expose a pub function like this helps: ```rust pub fn prune_columns(schema: &Schema, ids: HashSet<i32>, select_full_type: bool) -> Result<Type> { ... } ``` ########## crates/iceberg/src/spec/schema.rs: ########## @@ -642,6 +644,199 @@ impl SchemaVisitor for IndexByName { } } +struct PruneColumn { + selected: HashSet<i32>, + select_full_types: bool, +} + +impl PruneColumn { + fn new(selected: HashSet<i32>, select_full_types: bool) -> Self { + Self { + selected, + select_full_types, + } + } + + fn project_selected_struct(projected_field: Option<Type>) -> Result<StructType> { + match projected_field { + // If the field is a StructType, return it as such + Some(field) if field.is_struct() => Ok(field.as_struct_type().unwrap_or_default()), + // If projected_field is None or not a StructType, return an empty StructType + _ => Ok(StructType::default()), + } + } + fn project_list(list: &ListType, result: Option<Type>) -> Result<ListType> { Review Comment: We don't need to pass `Option` here? ########## crates/iceberg/src/spec/schema.rs: ########## @@ -642,6 +644,199 @@ impl SchemaVisitor for IndexByName { } } +struct PruneColumn { + selected: HashSet<i32>, + select_full_types: bool, +} + +impl PruneColumn { + fn new(selected: HashSet<i32>, select_full_types: bool) -> Self { + Self { + selected, + select_full_types, + } + } + + fn project_selected_struct(projected_field: Option<Type>) -> Result<StructType> { + match projected_field { + // If the field is a StructType, return it as such + Some(field) if field.is_struct() => Ok(field.as_struct_type().unwrap_or_default()), Review Comment: Do we really need `is_struct`? How about writing like this: ```rust match field { Some(Type::Struct(s)) => Ok(s), Some(_) => return error; None => None } ########## crates/iceberg/src/spec/schema.rs: ########## @@ -642,6 +644,199 @@ impl SchemaVisitor for IndexByName { } } +struct PruneColumn { + selected: HashSet<i32>, + select_full_types: bool, +} + +impl PruneColumn { + fn new(selected: HashSet<i32>, select_full_types: bool) -> Self { + Self { + selected, + select_full_types, + } + } + + fn project_selected_struct(projected_field: Option<Type>) -> Result<StructType> { + match projected_field { + // If the field is a StructType, return it as such + Some(field) if field.is_struct() => Ok(field.as_struct_type().unwrap_or_default()), + // If projected_field is None or not a StructType, return an empty StructType + _ => Ok(StructType::default()), + } + } + fn project_list(list: &ListType, result: Option<Type>) -> Result<ListType> { + if let Some(value_type) = result { + if *list.element_field.field_type == value_type { + return Ok(list.clone()); + } + Ok(ListType { + element_field: Arc::new(NestedField { + id: list.element_field.id, + name: list.element_field.name.clone(), + required: list.element_field.required, + field_type: Box::new(value_type), + doc: list.element_field.doc.clone(), + initial_default: list.element_field.initial_default.clone(), + write_default: list.element_field.write_default.clone(), + }), + }) + } else { + Err(Error::new(ErrorKind::DataInvalid, "Type value is none")) + } + } + fn project_map(map: &MapType, value: Option<Type>) -> Result<MapType> { + if let Some(value_type) = value { + if *map.value_field.field_type == value_type { + return Ok(map.clone()); + } + Ok(MapType { + key_field: map.key_field.clone(), + value_field: Arc::new(NestedField { + id: map.value_field.id, + name: map.value_field.name.clone(), + required: map.value_field.required, + field_type: Box::new(value_type), + doc: map.value_field.doc.clone(), + initial_default: map.value_field.initial_default.clone(), + write_default: map.value_field.write_default.clone(), + }), + }) + } else { + Err(Error::new(ErrorKind::DataInvalid, "Type value is none")) + } + } +} + +impl SchemaVisitor for PruneColumn { + type T = Option<Type>; + + fn schema(&mut self, _schema: &Schema, value: Self::T) -> Result<Self::T> { + Ok(Some(value.unwrap())) + } + + fn field(&mut self, field: &NestedFieldRef, value: Self::T) -> Result<Self::T> { + if self.selected.contains(&field.id) { + if self.select_full_types { + Ok(Some(*field.field_type.clone())) + } else if field.field_type.is_struct() { + return Ok(Some(Type::Struct(PruneColumn::project_selected_struct( + value, + )?))); + } else if !field.field_type.is_nested() { + return Ok(Some(*field.field_type.clone())); + } else { + return Err(Error::new( + ErrorKind::DataInvalid, + "Nested field types not allowed in this context".to_string(), + )); + } + } else { + Ok(value) + } + } + + fn r#struct(&mut self, r#struct: &StructType, results: Vec<Self::T>) -> Result<Self::T> { + let fields = r#struct.fields(); + let mut selected_field = Vec::with_capacity(fields.len()); + let mut same_type = true; + + for i in 0..results.len() { Review Comment: We can replace with `https://docs.rs/itertools/latest/itertools/fn.zip_eq.html` to make code easier to read. ########## crates/iceberg/src/spec/schema.rs: ########## @@ -642,6 +644,199 @@ impl SchemaVisitor for IndexByName { } } +struct PruneColumn { + selected: HashSet<i32>, + select_full_types: bool, +} + +impl PruneColumn { + fn new(selected: HashSet<i32>, select_full_types: bool) -> Self { + Self { + selected, + select_full_types, + } + } + + fn project_selected_struct(projected_field: Option<Type>) -> Result<StructType> { + match projected_field { + // If the field is a StructType, return it as such + Some(field) if field.is_struct() => Ok(field.as_struct_type().unwrap_or_default()), + // If projected_field is None or not a StructType, return an empty StructType + _ => Ok(StructType::default()), + } + } + fn project_list(list: &ListType, result: Option<Type>) -> Result<ListType> { + if let Some(value_type) = result { + if *list.element_field.field_type == value_type { + return Ok(list.clone()); + } + Ok(ListType { + element_field: Arc::new(NestedField { + id: list.element_field.id, + name: list.element_field.name.clone(), + required: list.element_field.required, + field_type: Box::new(value_type), + doc: list.element_field.doc.clone(), + initial_default: list.element_field.initial_default.clone(), + write_default: list.element_field.write_default.clone(), + }), + }) + } else { + Err(Error::new(ErrorKind::DataInvalid, "Type value is none")) + } + } + fn project_map(map: &MapType, value: Option<Type>) -> Result<MapType> { Review Comment: Ditto. ########## crates/iceberg/src/spec/schema.rs: ########## @@ -642,6 +644,199 @@ impl SchemaVisitor for IndexByName { } } +struct PruneColumn { + selected: HashSet<i32>, + select_full_types: bool, +} + +impl PruneColumn { + fn new(selected: HashSet<i32>, select_full_types: bool) -> Self { + Self { + selected, + select_full_types, + } + } + + fn project_selected_struct(projected_field: Option<Type>) -> Result<StructType> { + match projected_field { + // If the field is a StructType, return it as such + Some(field) if field.is_struct() => Ok(field.as_struct_type().unwrap_or_default()), + // If projected_field is None or not a StructType, return an empty StructType + _ => Ok(StructType::default()), + } + } + fn project_list(list: &ListType, result: Option<Type>) -> Result<ListType> { + if let Some(value_type) = result { + if *list.element_field.field_type == value_type { + return Ok(list.clone()); + } + Ok(ListType { + element_field: Arc::new(NestedField { + id: list.element_field.id, + name: list.element_field.name.clone(), + required: list.element_field.required, + field_type: Box::new(value_type), + doc: list.element_field.doc.clone(), + initial_default: list.element_field.initial_default.clone(), + write_default: list.element_field.write_default.clone(), + }), + }) + } else { + Err(Error::new(ErrorKind::DataInvalid, "Type value is none")) + } + } + fn project_map(map: &MapType, value: Option<Type>) -> Result<MapType> { + if let Some(value_type) = value { + if *map.value_field.field_type == value_type { + return Ok(map.clone()); + } + Ok(MapType { + key_field: map.key_field.clone(), + value_field: Arc::new(NestedField { + id: map.value_field.id, + name: map.value_field.name.clone(), + required: map.value_field.required, + field_type: Box::new(value_type), + doc: map.value_field.doc.clone(), + initial_default: map.value_field.initial_default.clone(), + write_default: map.value_field.write_default.clone(), + }), + }) + } else { + Err(Error::new(ErrorKind::DataInvalid, "Type value is none")) + } + } +} + +impl SchemaVisitor for PruneColumn { + type T = Option<Type>; + + fn schema(&mut self, _schema: &Schema, value: Self::T) -> Result<Self::T> { Review Comment: ```suggestion fn schema(&mut self, _schema: &Schema, value: Option<Type>) -> Result<Self::T> { ``` Though it's legal, it's better to use concrete type to improve readability. ########## crates/iceberg/src/spec/schema.rs: ########## @@ -642,6 +644,199 @@ impl SchemaVisitor for IndexByName { } } +struct PruneColumn { + selected: HashSet<i32>, + select_full_types: bool, +} + +impl PruneColumn { + fn new(selected: HashSet<i32>, select_full_types: bool) -> Self { + Self { + selected, + select_full_types, + } + } + + fn project_selected_struct(projected_field: Option<Type>) -> Result<StructType> { + match projected_field { + // If the field is a StructType, return it as such + Some(field) if field.is_struct() => Ok(field.as_struct_type().unwrap_or_default()), + // If projected_field is None or not a StructType, return an empty StructType + _ => Ok(StructType::default()), + } + } + fn project_list(list: &ListType, result: Option<Type>) -> Result<ListType> { + if let Some(value_type) = result { + if *list.element_field.field_type == value_type { + return Ok(list.clone()); + } + Ok(ListType { + element_field: Arc::new(NestedField { + id: list.element_field.id, + name: list.element_field.name.clone(), + required: list.element_field.required, + field_type: Box::new(value_type), + doc: list.element_field.doc.clone(), + initial_default: list.element_field.initial_default.clone(), + write_default: list.element_field.write_default.clone(), + }), + }) + } else { + Err(Error::new(ErrorKind::DataInvalid, "Type value is none")) + } + } + fn project_map(map: &MapType, value: Option<Type>) -> Result<MapType> { + if let Some(value_type) = value { + if *map.value_field.field_type == value_type { + return Ok(map.clone()); + } + Ok(MapType { + key_field: map.key_field.clone(), + value_field: Arc::new(NestedField { + id: map.value_field.id, + name: map.value_field.name.clone(), + required: map.value_field.required, + field_type: Box::new(value_type), + doc: map.value_field.doc.clone(), + initial_default: map.value_field.initial_default.clone(), + write_default: map.value_field.write_default.clone(), + }), + }) + } else { + Err(Error::new(ErrorKind::DataInvalid, "Type value is none")) + } + } +} + +impl SchemaVisitor for PruneColumn { + type T = Option<Type>; + + fn schema(&mut self, _schema: &Schema, value: Self::T) -> Result<Self::T> { + Ok(Some(value.unwrap())) + } + + fn field(&mut self, field: &NestedFieldRef, value: Self::T) -> Result<Self::T> { + if self.selected.contains(&field.id) { + if self.select_full_types { + Ok(Some(*field.field_type.clone())) + } else if field.field_type.is_struct() { + return Ok(Some(Type::Struct(PruneColumn::project_selected_struct( + value, + )?))); + } else if !field.field_type.is_nested() { + return Ok(Some(*field.field_type.clone())); + } else { + return Err(Error::new( + ErrorKind::DataInvalid, + "Nested field types not allowed in this context".to_string(), Review Comment: ```suggestion "Can't project list or map field directly when not selecting full type.".to_string(), ``` -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: issues-unsubscr...@iceberg.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org --------------------------------------------------------------------- To unsubscribe, e-mail: issues-unsubscr...@iceberg.apache.org For additional commands, e-mail: issues-h...@iceberg.apache.org