This is an automated email from the ASF dual-hosted git repository.
alamb pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git
The following commit(s) were added to refs/heads/main by this push:
new d0fa24e0e4 [Variant] Impl `PartialEq` for VariantObject (#7943)
d0fa24e0e4 is described below
commit d0fa24e0e44d3a572624618b5a9a8d04d82924ed
Author: Matthew Kim <[email protected]>
AuthorDate: Thu Jul 17 17:38:53 2025 +0200
[Variant] Impl `PartialEq` for VariantObject (#7943)
# Rationale for this change
- Closes https://github.com/apache/arrow-rs/issues/7948
This PR introduces a custom implementation of `PartialEq` for variant
objects.
According to the spec, field values are not required to be in the same
order as the field IDs, to enable flexibility when constructing Variant
values.
Instead of comparing the raw bytes of 2 variant objects, this
implementation recursively checks whether the field values are equal --
regardless of their order
---
parquet-variant/src/builder.rs | 111 +++++++++++-----
parquet-variant/src/variant/metadata.rs | 29 ++++-
parquet-variant/src/variant/object.rs | 219 +++++++++++++++++++++++++++++++-
3 files changed, 325 insertions(+), 34 deletions(-)
diff --git a/parquet-variant/src/builder.rs b/parquet-variant/src/builder.rs
index ae82cfec9d..73fa15255e 100644
--- a/parquet-variant/src/builder.rs
+++ b/parquet-variant/src/builder.rs
@@ -16,11 +16,12 @@
// under the License.
use crate::decoder::{VariantBasicType, VariantPrimitiveType};
use crate::{
- ShortString, Variant, VariantDecimal16, VariantDecimal4, VariantDecimal8,
VariantMetadata,
+ ShortString, Variant, VariantDecimal16, VariantDecimal4, VariantDecimal8,
VariantList,
+ VariantMetadata, VariantObject,
};
use arrow_schema::ArrowError;
use indexmap::{IndexMap, IndexSet};
-use std::collections::{HashMap, HashSet};
+use std::collections::HashSet;
const BASIC_TYPE_BITS: u8 = 2;
const UNIX_EPOCH_DATE: chrono::NaiveDate =
chrono::NaiveDate::from_ymd_opt(1970, 1, 1).unwrap();
@@ -216,6 +217,57 @@ impl ValueBuffer {
self.append_slice(value.as_bytes());
}
+ fn append_object(&mut self, metadata_builder: &mut MetadataBuilder, obj:
VariantObject) {
+ let mut object_builder = self.new_object(metadata_builder);
+
+ for (field_name, value) in obj.iter() {
+ object_builder.insert(field_name, value);
+ }
+
+ object_builder.finish().unwrap();
+ }
+
+ fn try_append_object(
+ &mut self,
+ metadata_builder: &mut MetadataBuilder,
+ obj: VariantObject,
+ ) -> Result<(), ArrowError> {
+ let mut object_builder = self.new_object(metadata_builder);
+
+ for res in obj.iter_try() {
+ let (field_name, value) = res?;
+ object_builder.try_insert(field_name, value)?;
+ }
+
+ object_builder.finish()?;
+
+ Ok(())
+ }
+
+ fn append_list(&mut self, metadata_builder: &mut MetadataBuilder, list:
VariantList) {
+ let mut list_builder = self.new_list(metadata_builder);
+ for value in list.iter() {
+ list_builder.append_value(value);
+ }
+ list_builder.finish();
+ }
+
+ fn try_append_list(
+ &mut self,
+ metadata_builder: &mut MetadataBuilder,
+ list: VariantList,
+ ) -> Result<(), ArrowError> {
+ let mut list_builder = self.new_list(metadata_builder);
+ for res in list.iter_try() {
+ let value = res?;
+ list_builder.try_append_value(value)?;
+ }
+
+ list_builder.finish();
+
+ Ok(())
+ }
+
fn offset(&self) -> usize {
self.0.len()
}
@@ -252,9 +304,31 @@ impl ValueBuffer {
variant: Variant<'m, 'd>,
metadata_builder: &mut MetadataBuilder,
) {
- self.try_append_variant(variant, metadata_builder).unwrap();
+ match variant {
+ Variant::Null => self.append_null(),
+ Variant::BooleanTrue => self.append_bool(true),
+ Variant::BooleanFalse => self.append_bool(false),
+ Variant::Int8(v) => self.append_int8(v),
+ Variant::Int16(v) => self.append_int16(v),
+ Variant::Int32(v) => self.append_int32(v),
+ Variant::Int64(v) => self.append_int64(v),
+ Variant::Date(v) => self.append_date(v),
+ Variant::TimestampMicros(v) => self.append_timestamp_micros(v),
+ Variant::TimestampNtzMicros(v) =>
self.append_timestamp_ntz_micros(v),
+ Variant::Decimal4(decimal4) => self.append_decimal4(decimal4),
+ Variant::Decimal8(decimal8) => self.append_decimal8(decimal8),
+ Variant::Decimal16(decimal16) => self.append_decimal16(decimal16),
+ Variant::Float(v) => self.append_float(v),
+ Variant::Double(v) => self.append_double(v),
+ Variant::Binary(v) => self.append_binary(v),
+ Variant::String(s) => self.append_string(s),
+ Variant::ShortString(s) => self.append_short_string(s),
+ Variant::Object(obj) => self.append_object(metadata_builder, obj),
+ Variant::List(list) => self.append_list(metadata_builder, list),
+ }
}
+ /// Appends a variant to the buffer
fn try_append_variant<'m, 'd>(
&mut self,
variant: Variant<'m, 'd>,
@@ -279,35 +353,8 @@ impl ValueBuffer {
Variant::Binary(v) => self.append_binary(v),
Variant::String(s) => self.append_string(s),
Variant::ShortString(s) => self.append_short_string(s),
- Variant::Object(obj) => {
- let metadata_field_names = metadata_builder
- .field_names
- .iter()
- .enumerate()
- .map(|(i, f)| (f.clone(), i))
- .collect::<HashMap<_, _>>();
-
- let mut object_builder = self.new_object(metadata_builder);
-
- // first add all object fields that exist in metadata builder
- let mut object_fields = obj.iter().collect::<Vec<_>>();
-
- object_fields
- .sort_by_key(|(field_name, _)|
metadata_field_names.get(field_name as &str));
-
- for (field_name, value) in object_fields {
- object_builder.insert(field_name, value);
- }
-
- object_builder.finish()?;
- }
- Variant::List(list) => {
- let mut list_builder = self.new_list(metadata_builder);
- for value in list.iter() {
- list_builder.append_value(value);
- }
- list_builder.finish();
- }
+ Variant::Object(obj) => self.try_append_object(metadata_builder,
obj)?,
+ Variant::List(list) => self.try_append_list(metadata_builder,
list)?,
}
Ok(())
diff --git a/parquet-variant/src/variant/metadata.rs
b/parquet-variant/src/variant/metadata.rs
index c75f232aa7..f957ebb6f1 100644
--- a/parquet-variant/src/variant/metadata.rs
+++ b/parquet-variant/src/variant/metadata.rs
@@ -15,6 +15,8 @@
// specific language governing permissions and limitations
// under the License.
+use std::collections::HashSet;
+
use crate::decoder::{map_bytes_to_offsets, OffsetSizeBytes};
use crate::utils::{first_byte_from_slice, overflow_error, slice_from_slice,
string_from_slice};
@@ -125,7 +127,7 @@ impl VariantMetadataHeader {
///
/// [`Variant`]: crate::Variant
/// [Variant Spec]:
https://github.com/apache/parquet-format/blob/master/VariantEncoding.md#metadata-encoding
-#[derive(Debug, Clone, PartialEq)]
+#[derive(Debug, Clone)]
pub struct VariantMetadata<'m> {
pub(crate) bytes: &'m [u8],
header: VariantMetadataHeader,
@@ -346,6 +348,30 @@ impl<'m> VariantMetadata<'m> {
}
}
+// According to the spec, metadata dictionaries are not required to be in a
specific order,
+// to enable flexibility when constructing Variant values
+//
+// Instead of comparing the raw bytes of 2 variant metadata instances, this
implementation
+// checks whether the dictionary entries are equal -- regardless of their
sorting order
+impl<'m> PartialEq for VariantMetadata<'m> {
+ fn eq(&self, other: &Self) -> bool {
+ let is_equal = self.is_empty() == other.is_empty()
+ && self.is_fully_validated() == other.is_fully_validated()
+ && self.first_value_byte == other.first_value_byte
+ && self.validated == other.validated;
+
+ let other_field_names: HashSet<&'m str> =
HashSet::from_iter(other.iter());
+
+ for field_name in self.iter() {
+ if !other_field_names.contains(field_name) {
+ return false;
+ }
+ }
+
+ is_equal
+ }
+}
+
/// Retrieves the ith dictionary entry, panicking if the index is out of
bounds. Accessing
/// [unvalidated] input could also panic if the underlying bytes are invalid.
///
@@ -360,6 +386,7 @@ impl std::ops::Index<usize> for VariantMetadata<'_> {
#[cfg(test)]
mod tests {
+
use super::*;
/// `"cat"`, `"dog"` – valid metadata
diff --git a/parquet-variant/src/variant/object.rs
b/parquet-variant/src/variant/object.rs
index 50094cb39d..bce2ffc876 100644
--- a/parquet-variant/src/variant/object.rs
+++ b/parquet-variant/src/variant/object.rs
@@ -14,11 +14,13 @@
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
+
use crate::decoder::{map_bytes_to_offsets, OffsetSizeBytes};
use crate::utils::{
first_byte_from_slice, overflow_error, slice_from_slice,
try_binary_search_range_by,
};
use crate::variant::{Variant, VariantMetadata};
+use std::collections::HashMap;
use arrow_schema::ArrowError;
@@ -114,7 +116,7 @@ impl VariantObjectHeader {
///
/// [valid]: VariantMetadata#Validation
/// [Variant spec]:
https://github.com/apache/parquet-format/blob/master/VariantEncoding.md#value-data-for-object-basic_type2
-#[derive(Debug, Clone, PartialEq)]
+#[derive(Debug, Clone)]
pub struct VariantObject<'m, 'v> {
pub metadata: VariantMetadata<'m>,
pub value: &'v [u8],
@@ -401,6 +403,38 @@ impl<'m, 'v> VariantObject<'m, 'v> {
}
}
+// Custom implementation of PartialEq for variant objects
+//
+// According to the spec, field values are not required to be in the same
order as the field IDs,
+// to enable flexibility when constructing Variant values
+//
+// Instead of comparing the raw bytes of 2 variant objects, this
implementation recursively
+// checks whether the field values are equal -- regardless of their order
+impl<'m, 'v> PartialEq for VariantObject<'m, 'v> {
+ fn eq(&self, other: &Self) -> bool {
+ let mut is_equal = self.metadata == other.metadata
+ && self.header == other.header
+ && self.num_elements == other.num_elements
+ && self.first_field_offset_byte == other.first_field_offset_byte
+ && self.first_value_byte == other.first_value_byte
+ && self.validated == other.validated;
+
+ // value validation
+ let other_fields: HashMap<&str, Variant> =
HashMap::from_iter(other.iter());
+
+ for (field_name, variant) in self.iter() {
+ match other_fields.get(field_name as &str) {
+ Some(other_variant) => {
+ is_equal = is_equal && variant == *other_variant;
+ }
+ None => return false,
+ }
+ }
+
+ is_equal
+ }
+}
+
#[cfg(test)]
mod tests {
use crate::VariantBuilder;
@@ -732,4 +766,187 @@ mod tests {
test_variant_object_with_large_data(16777216 + 1,
OffsetSizeBytes::Four);
// 2^24
}
+
+ #[test]
+ fn test_objects_with_same_fields_are_equal() {
+ let mut b = VariantBuilder::new();
+ let mut o = b.new_object();
+
+ o.insert("b", ());
+ o.insert("c", ());
+ o.insert("a", ());
+
+ o.finish().unwrap();
+
+ let (m, v) = b.finish();
+
+ let v1 = Variant::try_new(&m, &v).unwrap();
+ let v2 = Variant::try_new(&m, &v).unwrap();
+
+ assert_eq!(v1, v2);
+ }
+
+ #[test]
+ fn test_same_objects_with_different_builder_are_equal() {
+ let mut b = VariantBuilder::new();
+ let mut o = b.new_object();
+
+ o.insert("a", ());
+ o.insert("b", false);
+
+ o.finish().unwrap();
+ let (m, v) = b.finish();
+
+ let v1 = Variant::try_new(&m, &v).unwrap();
+
+ let mut b = VariantBuilder::new();
+ let mut o = b.new_object();
+
+ o.insert("a", ());
+ o.insert("b", false);
+
+ o.finish().unwrap();
+ let (m, v) = b.finish();
+
+ let v2 = Variant::try_new(&m, &v).unwrap();
+
+ assert_eq!(v1, v2);
+ }
+
+ #[test]
+ fn test_objects_with_different_values_are_not_equal() {
+ let mut b = VariantBuilder::new();
+ let mut o = b.new_object();
+
+ o.insert("a", ());
+ o.insert("b", 4.3);
+
+ o.finish().unwrap();
+
+ let (m, v) = b.finish();
+
+ let v1 = Variant::try_new(&m, &v).unwrap();
+
+ // second object, same field name but different values
+ let mut b = VariantBuilder::new();
+ let mut o = b.new_object();
+
+ o.insert("a", ());
+ let mut inner_o = o.new_object("b");
+ inner_o.insert("a", 3.3);
+ inner_o.finish().unwrap();
+ o.finish().unwrap();
+
+ let (m, v) = b.finish();
+
+ let v2 = Variant::try_new(&m, &v).unwrap();
+
+ let m1 = v1.metadata().unwrap();
+ let m2 = v2.metadata().unwrap();
+
+ // metadata would be equal since they contain the same keys
+ assert_eq!(m1, m2);
+
+ // but the objects are not equal
+ assert_ne!(v1, v2);
+ }
+
+ #[test]
+ fn test_objects_with_different_field_names_are_not_equal() {
+ let mut b = VariantBuilder::new();
+ let mut o = b.new_object();
+
+ o.insert("a", ());
+ o.insert("b", 4.3);
+
+ o.finish().unwrap();
+
+ let (m, v) = b.finish();
+
+ let v1 = Variant::try_new(&m, &v).unwrap();
+
+ // second object, same field name but different values
+ let mut b = VariantBuilder::new();
+ let mut o = b.new_object();
+
+ o.insert("aardvark", ());
+ o.insert("barracuda", 3.3);
+
+ o.finish().unwrap();
+
+ let (m, v) = b.finish();
+ let v2 = Variant::try_new(&m, &v).unwrap();
+
+ assert_ne!(v1, v2);
+ }
+
+ #[test]
+ fn test_objects_with_different_insertion_order_are_equal() {
+ let mut b = VariantBuilder::new();
+ let mut o = b.new_object();
+
+ o.insert("b", false);
+ o.insert("a", ());
+
+ o.finish().unwrap();
+
+ let (m, v) = b.finish();
+
+ let v1 = Variant::try_new(&m, &v).unwrap();
+ assert!(!v1.metadata().unwrap().is_sorted());
+
+ // create another object pre-filled with field names, b and a
+ // but insert the fields in the order of a, b
+ let mut b = VariantBuilder::new().with_field_names(["b",
"a"].into_iter());
+ let mut o = b.new_object();
+
+ o.insert("a", ());
+ o.insert("b", false);
+
+ o.finish().unwrap();
+
+ let (m, v) = b.finish();
+
+ let v2 = Variant::try_new(&m, &v).unwrap();
+
+ // v2 should also have a unsorted dictionary
+ assert!(!v2.metadata().unwrap().is_sorted());
+
+ assert_eq!(v1, v2);
+ }
+
+ #[test]
+ fn test_objects_with_differing_metadata_are_equal() {
+ let mut b = VariantBuilder::new();
+ let mut o = b.new_object();
+
+ o.insert("a", ());
+ o.insert("b", 4.3);
+
+ o.finish().unwrap();
+
+ let (m, v) = b.finish();
+
+ let v1 = Variant::try_new(&m, &v).unwrap();
+ // v1 is sorted
+ assert!(v1.metadata().unwrap().is_sorted());
+
+ // create a second object with different insertion order
+ let mut b = VariantBuilder::new();
+ let mut o = b.new_object();
+
+ o.insert("b", 4.3);
+ o.insert("a", ());
+
+ o.finish().unwrap();
+
+ let (m, v) = b.finish();
+
+ let v2 = Variant::try_new(&m, &v).unwrap();
+ // v2 is not sorted
+ assert!(!v2.metadata().unwrap().is_sorted());
+
+ // objects are still logically equal
+ assert_eq!(v1, v2);
+ }
}