This is an automated email from the ASF dual-hosted git repository.
alamb pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git
The following commit(s) were added to refs/heads/main by this push:
new d8b5ef7595 [Variant] Avoid superflous validation checks (#7906)
d8b5ef7595 is described below
commit d8b5ef75950e88dae6f5c8f909a78b2ac1d097a4
Author: Matthew Kim <[email protected]>
AuthorDate: Sat Jul 12 15:06:08 2025 -0400
[Variant] Avoid superflous validation checks (#7906)
# Which issue does this PR close?
- Closes https://github.com/apache/arrow-rs/issues/7900
# Rational for this change
We can avoid certain checks in the validation code since other checks
already guarantee these invariants for us
---
parquet-variant/src/variant.rs | 12 ++++++++++++
parquet-variant/src/variant/list.rs | 27 +++++++++------------------
parquet-variant/src/variant/metadata.rs | 25 +++++++++++++++----------
parquet-variant/src/variant/object.rs | 15 ++-------------
4 files changed, 38 insertions(+), 41 deletions(-)
diff --git a/parquet-variant/src/variant.rs b/parquet-variant/src/variant.rs
index 8138549b1a..ce593cd2b0 100644
--- a/parquet-variant/src/variant.rs
+++ b/parquet-variant/src/variant.rs
@@ -1199,8 +1199,20 @@ impl TryFrom<(i128, u8)> for Variant<'_, '_> {
#[cfg(test)]
mod tests {
+
use super::*;
+ #[test]
+ fn test_empty_variant_will_fail() {
+ let metadata = VariantMetadata::try_new(&[1, 0, 0]).unwrap();
+
+ let err = Variant::try_new_with_metadata(metadata, &[]).unwrap_err();
+
+ assert!(matches!(
+ err,
+ ArrowError::InvalidArgumentError(ref msg) if msg == "Received
empty bytes"));
+ }
+
#[test]
fn test_construct_short_string() {
let short_string = ShortString::try_new("norm").expect("should fit in
short string");
diff --git a/parquet-variant/src/variant/list.rs
b/parquet-variant/src/variant/list.rs
index 17f87a2e0d..6de6ed8307 100644
--- a/parquet-variant/src/variant/list.rs
+++ b/parquet-variant/src/variant/list.rs
@@ -216,28 +216,19 @@ impl<'m, 'v> VariantList<'m, 'v> {
self.header.first_offset_byte() as _..self.first_value_byte as
_,
)?;
- let offsets =
- map_bytes_to_offsets(offset_buffer,
self.header.offset_size).collect::<Vec<_>>();
-
- // Validate offsets are in-bounds and monotonically increasing.
- // Since shallow verification checks whether the first and last
offsets are in-bounds,
- // we can also verify all offsets are in-bounds by checking if
offsets are monotonically increasing.
- let are_offsets_monotonic = offsets.is_sorted_by(|a, b| a < b);
- if !are_offsets_monotonic {
- return Err(ArrowError::InvalidArgumentError(
- "offsets are not monotonically increasing".to_string(),
- ));
- }
-
let value_buffer = slice_from_slice(self.value,
self.first_value_byte as _..)?;
// Validate whether values are valid variant objects
- for i in 1..offsets.len() {
- let start_offset = offsets[i - 1];
- let end_offset = offsets[i];
-
- let value_bytes = slice_from_slice(value_buffer,
start_offset..end_offset)?;
+ //
+ // Since we use offsets to slice into the value buffer, this also
verifies all offsets are in-bounds
+ // and monotonically increasing
+ let mut offset_iter = map_bytes_to_offsets(offset_buffer,
self.header.offset_size);
+ let mut current_offset = offset_iter.next().unwrap_or(0);
+
+ for next_offset in offset_iter {
+ let value_bytes = slice_from_slice(value_buffer,
current_offset..next_offset)?;
Variant::try_new_with_metadata(self.metadata.clone(),
value_bytes)?;
+ current_offset = next_offset;
}
self.validated = true;
diff --git a/parquet-variant/src/variant/metadata.rs
b/parquet-variant/src/variant/metadata.rs
index 007122af75..9653473b10 100644
--- a/parquet-variant/src/variant/metadata.rs
+++ b/parquet-variant/src/variant/metadata.rs
@@ -237,22 +237,15 @@ impl<'m> VariantMetadata<'m> {
let offsets =
map_bytes_to_offsets(offset_bytes,
self.header.offset_size).collect::<Vec<_>>();
- // Validate offsets are in-bounds and monotonically increasing.
- // Since shallow validation ensures the first and last offsets are
in bounds, we can also verify all offsets
- // are in-bounds by checking if offsets are monotonically
increasing.
- let are_offsets_monotonic = offsets.is_sorted_by(|a, b| a < b);
- if !are_offsets_monotonic {
- return Err(ArrowError::InvalidArgumentError(
- "offsets not monotonically increasing".to_string(),
- ));
- }
-
// Verify the string values in the dictionary are UTF-8 encoded
strings.
let value_buffer =
string_from_slice(self.bytes, 0, self.first_value_byte as
_..self.bytes.len())?;
if self.header.is_sorted {
// Validate the dictionary values are unique and
lexicographically sorted
+ //
+ // Since we use the offsets to access dictionary values, this
also validates
+ // offsets are in-bounds and monotonically increasing
let are_dictionary_values_unique_and_sorted =
(1..offsets.len())
.map(|i| {
let field_range = offsets[i - 1]..offsets[i];
@@ -268,6 +261,18 @@ impl<'m> VariantMetadata<'m> {
"dictionary values are not unique and
ordered".to_string(),
));
}
+ } else {
+ // Validate offsets are in-bounds and monotonically increasing
+ //
+ // Since shallow validation ensures the first and last offsets
are in bounds,
+ // we can also verify all offsets are in-bounds by checking if
+ // offsets are monotonically increasing
+ let are_offsets_monotonic = offsets.is_sorted_by(|a, b| a < b);
+ if !are_offsets_monotonic {
+ return Err(ArrowError::InvalidArgumentError(
+ "offsets not monotonically increasing".to_string(),
+ ));
+ }
}
self.validated = true;
diff --git a/parquet-variant/src/variant/object.rs
b/parquet-variant/src/variant/object.rs
index dd6da08fbe..e2c6cb7b79 100644
--- a/parquet-variant/src/variant/object.rs
+++ b/parquet-variant/src/variant/object.rs
@@ -242,6 +242,8 @@ impl<'m, 'v> VariantObject<'m, 'v> {
} else {
// The metadata dictionary can't guarantee uniqueness or
sortedness, so we have to parse out the corresponding field names
// to check lexicographical order
+ //
+ // Since we are probing the metadata dictionary by field id,
this also verifies field ids are in-bounds
let are_field_names_sorted = field_ids
.iter()
.map(|&i| self.metadata.get(i))
@@ -253,19 +255,6 @@ impl<'m, 'v> VariantObject<'m, 'v> {
"field names not sorted".to_string(),
));
}
-
- // Since field ids are not guaranteed to be sorted, scan over
all field ids
- // and check that field ids are less than dictionary size
-
- let are_field_ids_in_bounds = field_ids
- .iter()
- .all(|&id| id < self.metadata.dictionary_size());
-
- if !are_field_ids_in_bounds {
- return Err(ArrowError::InvalidArgumentError(
- "field id is not valid".to_string(),
- ));
- }
}
// Validate whether values are valid variant objects