This is an automated email from the ASF dual-hosted git repository.

alamb pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git


The following commit(s) were added to refs/heads/main by this push:
     new d8b5ef7595 [Variant] Avoid superfluous validation checks (#7906)
d8b5ef7595 is described below

commit d8b5ef75950e88dae6f5c8f909a78b2ac1d097a4
Author: Matthew Kim <[email protected]>
AuthorDate: Sat Jul 12 15:06:08 2025 -0400

    [Variant] Avoid superfluous validation checks (#7906)
    
    # Which issue does this PR close?
    
    - Closes https://github.com/apache/arrow-rs/issues/7900
    
    # Rationale for this change
    
    We can avoid certain checks in the validation code since other checks
    already guarantee these invariants for us
---
 parquet-variant/src/variant.rs          | 12 ++++++++++++
 parquet-variant/src/variant/list.rs     | 27 +++++++++------------------
 parquet-variant/src/variant/metadata.rs | 25 +++++++++++++++----------
 parquet-variant/src/variant/object.rs   | 15 ++-------------
 4 files changed, 38 insertions(+), 41 deletions(-)

diff --git a/parquet-variant/src/variant.rs b/parquet-variant/src/variant.rs
index 8138549b1a..ce593cd2b0 100644
--- a/parquet-variant/src/variant.rs
+++ b/parquet-variant/src/variant.rs
@@ -1199,8 +1199,20 @@ impl TryFrom<(i128, u8)> for Variant<'_, '_> {
 
 #[cfg(test)]
 mod tests {
+
     use super::*;
 
+    #[test]
+    fn test_empty_variant_will_fail() {
+        let metadata = VariantMetadata::try_new(&[1, 0, 0]).unwrap();
+
+        let err = Variant::try_new_with_metadata(metadata, &[]).unwrap_err();
+
+        assert!(matches!(
+            err,
+            ArrowError::InvalidArgumentError(ref msg) if msg == "Received empty bytes"));
+    }
+
     #[test]
     fn test_construct_short_string() {
        let short_string = ShortString::try_new("norm").expect("should fit in short string");
diff --git a/parquet-variant/src/variant/list.rs 
b/parquet-variant/src/variant/list.rs
index 17f87a2e0d..6de6ed8307 100644
--- a/parquet-variant/src/variant/list.rs
+++ b/parquet-variant/src/variant/list.rs
@@ -216,28 +216,19 @@ impl<'m, 'v> VariantList<'m, 'v> {
                 self.header.first_offset_byte() as _..self.first_value_byte as 
_,
             )?;
 
-            let offsets =
-                map_bytes_to_offsets(offset_buffer, 
self.header.offset_size).collect::<Vec<_>>();
-
-            // Validate offsets are in-bounds and monotonically increasing.
-            // Since shallow verification checks whether the first and last 
offsets are in-bounds,
-            // we can also verify all offsets are in-bounds by checking if 
offsets are monotonically increasing.
-            let are_offsets_monotonic = offsets.is_sorted_by(|a, b| a < b);
-            if !are_offsets_monotonic {
-                return Err(ArrowError::InvalidArgumentError(
-                    "offsets are not monotonically increasing".to_string(),
-                ));
-            }
-
             let value_buffer = slice_from_slice(self.value, 
self.first_value_byte as _..)?;
 
             // Validate whether values are valid variant objects
-            for i in 1..offsets.len() {
-                let start_offset = offsets[i - 1];
-                let end_offset = offsets[i];
-
-                let value_bytes = slice_from_slice(value_buffer, 
start_offset..end_offset)?;
+            //
+            // Since we use offsets to slice into the value buffer, this also 
verifies all offsets are in-bounds
+            // and monotonically increasing
+            let mut offset_iter = map_bytes_to_offsets(offset_buffer, 
self.header.offset_size);
+            let mut current_offset = offset_iter.next().unwrap_or(0);
+
+            for next_offset in offset_iter {
+                let value_bytes = slice_from_slice(value_buffer, 
current_offset..next_offset)?;
                 Variant::try_new_with_metadata(self.metadata.clone(), 
value_bytes)?;
+                current_offset = next_offset;
             }
 
             self.validated = true;
diff --git a/parquet-variant/src/variant/metadata.rs 
b/parquet-variant/src/variant/metadata.rs
index 007122af75..9653473b10 100644
--- a/parquet-variant/src/variant/metadata.rs
+++ b/parquet-variant/src/variant/metadata.rs
@@ -237,22 +237,15 @@ impl<'m> VariantMetadata<'m> {
             let offsets =
                 map_bytes_to_offsets(offset_bytes, 
self.header.offset_size).collect::<Vec<_>>();
 
-            // Validate offsets are in-bounds and monotonically increasing.
-            // Since shallow validation ensures the first and last offsets are 
in bounds, we can also verify all offsets
-            // are in-bounds by checking if offsets are monotonically 
increasing.
-            let are_offsets_monotonic = offsets.is_sorted_by(|a, b| a < b);
-            if !are_offsets_monotonic {
-                return Err(ArrowError::InvalidArgumentError(
-                    "offsets not monotonically increasing".to_string(),
-                ));
-            }
-
             // Verify the string values in the dictionary are UTF-8 encoded 
strings.
             let value_buffer =
                 string_from_slice(self.bytes, 0, self.first_value_byte as 
_..self.bytes.len())?;
 
             if self.header.is_sorted {
                 // Validate the dictionary values are unique and 
lexicographically sorted
+                //
+                // Since we use the offsets to access dictionary values, this 
also validates
+                // offsets are in-bounds and monotonically increasing
                 let are_dictionary_values_unique_and_sorted = 
(1..offsets.len())
                     .map(|i| {
                         let field_range = offsets[i - 1]..offsets[i];
@@ -268,6 +261,18 @@ impl<'m> VariantMetadata<'m> {
                         "dictionary values are not unique and 
ordered".to_string(),
                     ));
                 }
+            } else {
+                // Validate offsets are in-bounds and monotonically increasing
+                //
+                // Since shallow validation ensures the first and last offsets 
are in bounds,
+                // we can also verify all offsets are in-bounds by checking if
+                // offsets are monotonically increasing
+                let are_offsets_monotonic = offsets.is_sorted_by(|a, b| a < b);
+                if !are_offsets_monotonic {
+                    return Err(ArrowError::InvalidArgumentError(
+                        "offsets not monotonically increasing".to_string(),
+                    ));
+                }
             }
 
             self.validated = true;
diff --git a/parquet-variant/src/variant/object.rs 
b/parquet-variant/src/variant/object.rs
index dd6da08fbe..e2c6cb7b79 100644
--- a/parquet-variant/src/variant/object.rs
+++ b/parquet-variant/src/variant/object.rs
@@ -242,6 +242,8 @@ impl<'m, 'v> VariantObject<'m, 'v> {
             } else {
                 // The metadata dictionary can't guarantee uniqueness or 
sortedness, so we have to parse out the corresponding field names
                 // to check lexicographical order
+                //
+                // Since we are probing the metadata dictionary by field id, 
this also verifies field ids are in-bounds
                 let are_field_names_sorted = field_ids
                     .iter()
                     .map(|&i| self.metadata.get(i))
@@ -253,19 +255,6 @@ impl<'m, 'v> VariantObject<'m, 'v> {
                         "field names not sorted".to_string(),
                     ));
                 }
-
-                // Since field ids are not guaranteed to be sorted, scan over 
all field ids
-                // and check that field ids are less than dictionary size
-
-                let are_field_ids_in_bounds = field_ids
-                    .iter()
-                    .all(|&id| id < self.metadata.dictionary_size());
-
-                if !are_field_ids_in_bounds {
-                    return Err(ArrowError::InvalidArgumentError(
-                        "field id is not valid".to_string(),
-                    ));
-                }
             }
 
             // Validate whether values are valid variant objects

Reply via email to