This is an automated email from the ASF dual-hosted git repository.
dheres pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git
The following commit(s) were added to refs/heads/main by this push:
new 860b2db748 parquet: reduce clone in delta byte array decoder (#9282)
860b2db748 is described below
commit 860b2db748f11fc93960793ede9315f2962d4dfc
Author: Lanqing Yang <[email protected]>
AuthorDate: Sat Jan 31 04:41:38 2026 -0800
parquet: reduce clone in delta byte array decoder (#9282)
# Which issue does this PR close?
None; this is a small optimization.
# Rationale for this change
The key insight is that cloning a `Bytes` is cheap (just a reference-count
bump), whereas cloning a `Vec` costs an allocation plus a memcpy.
before
```
let mut result = Vec::new(); // alloc #1
result.extend_from_slice(prefix);
result.extend_from_slice(suffix);
let data = Bytes::from(result.clone()); // alloc #2 + memcpy
item.set_from_bytes(data);
self.previous_value = result; // keep Vec
```
after
```
let mut result = Vec::with_capacity(prefix_len + suffix.len()); // alloc #1
result.extend_from_slice(&self.previous_value[..prefix_len]);
result.extend_from_slice(suffix);
let data = Bytes::from(result); // no alloc, takes Vec buffer
item.set_from_bytes(data.clone()); // cheap refcount bump
self.previous_value = data; // move, no alloc
```
# What changes are included in this PR?
- The type of `previous_value` is changed from `Vec<u8>` to `Bytes`.
- The `result` Vec is preallocated with the exact capacity it needs.
# Are these changes tested?
No new tests were added; the existing tests should continue to pass.
# Are there any user-facing changes?
no
---
parquet/src/encodings/decoding.rs | 15 ++++++++-------
1 file changed, 8 insertions(+), 7 deletions(-)
diff --git a/parquet/src/encodings/decoding.rs
b/parquet/src/encodings/decoding.rs
index 1f81c67dab..58430820a9 100644
--- a/parquet/src/encodings/decoding.rs
+++ b/parquet/src/encodings/decoding.rs
@@ -1008,7 +1008,8 @@ pub struct DeltaByteArrayDecoder<T: DataType> {
suffix_decoder: Option<DeltaLengthByteArrayDecoder<ByteArrayType>>,
// The last byte array, used to derive the current prefix
- previous_value: Vec<u8>,
+ // Stored as Bytes to avoid clone allocation when creating output
+ previous_value: Bytes,
// Number of values left
num_values: usize,
@@ -1030,7 +1031,7 @@ impl<T: DataType> DeltaByteArrayDecoder<T> {
prefix_lengths: vec![],
current_idx: 0,
suffix_decoder: None,
- previous_value: vec![],
+ previous_value: Bytes::new(),
num_values: 0,
_phantom: PhantomData,
}
@@ -1053,7 +1054,7 @@ impl<T: DataType> Decoder<T> for DeltaByteArrayDecoder<T>
{
self.suffix_decoder = Some(suffix_decoder);
self.num_values = num_prefixes;
self.current_idx = 0;
- self.previous_value.clear();
+ self.previous_value = Bytes::new();
Ok(())
}
_ => Err(general_err!(
@@ -1081,14 +1082,14 @@ impl<T: DataType> Decoder<T> for
DeltaByteArrayDecoder<T> {
let prefix_len = self.prefix_lengths[self.current_idx] as
usize;
// Concatenate prefix with suffix
- let mut result = Vec::new();
+ let mut result = Vec::with_capacity(prefix_len +
suffix.len());
result.extend_from_slice(&self.previous_value[0..prefix_len]);
result.extend_from_slice(suffix);
- let data = Bytes::from(result.clone());
- item.set_from_bytes(data);
+ let data = Bytes::from(result);
+ item.set_from_bytes(data.clone());
- self.previous_value = result;
+ self.previous_value = data;
self.current_idx += 1;
}