This is an automated email from the ASF dual-hosted git repository.

dheres pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git


The following commit(s) were added to refs/heads/main by this push:
     new f445bfcb3c perf: improve calculating length performance for 
`GenericByteArray` in row conversion (#9078)
f445bfcb3c is described below

commit f445bfcb3ccef97acb8400c79511626a76350fc6
Author: Raz Luvaton <[email protected]>
AuthorDate: Wed Jan 14 14:32:58 2026 +0200

    perf: improve calculating length performance for `GenericByteArray` in row 
conversion (#9078)
    
    # Which issue does this PR close?
    
    N/A
    
    # Rationale for this change
    
    Make the row length calculation faster, which results in faster row
    conversion
    
    # What changes are included in this PR?
    
    Instead of iterating over the items in the array and getting the length
    from the byte slice, we use the offsets directly and zip with nulls if
    necessary
    
    # Are these changes tested?
    
    Existing tests
    
    # Are there any user-facing changes?
    
    Faster encoding
    
    ------
    
    Split into 2 more PRs, as the other 2 change the public API
    
    Related to:
    - #9079
    - #9080
    
    ---------
    
    Co-authored-by: Andrew Lamb <[email protected]>
---
 arrow-row/src/lib.rs      | 50 +++++++++++++++++++++++++++--------------------
 arrow-row/src/variable.rs | 15 +++++++++++---
 2 files changed, 41 insertions(+), 24 deletions(-)

diff --git a/arrow-row/src/lib.rs b/arrow-row/src/lib.rs
index 28c65c5994..361b554414 100644
--- a/arrow-row/src/lib.rs
+++ b/arrow-row/src/lib.rs
@@ -164,7 +164,7 @@ use std::hash::{Hash, Hasher};
 use std::sync::Arc;
 
 use arrow_array::cast::*;
-use arrow_array::types::{ArrowDictionaryKeyType, ByteViewType};
+use arrow_array::types::{ArrowDictionaryKeyType, ByteArrayType, ByteViewType};
 use arrow_array::*;
 use arrow_buffer::{ArrowNativeType, Buffer, OffsetBuffer, ScalarBuffer};
 use arrow_data::{ArrayData, ArrayDataBuilder};
@@ -1545,27 +1545,11 @@ fn row_lengths(cols: &[ArrayRef], encoders: &[Encoder]) 
-> LengthTracker {
                     array => tracker.push_fixed(fixed::encoded_len(array)),
                     DataType::Null => {},
                     DataType::Boolean => tracker.push_fixed(bool::ENCODED_LEN),
-                    DataType::Binary => tracker.push_variable(
-                        as_generic_binary_array::<i32>(array)
-                            .iter()
-                            .map(|slice| variable::encoded_len(slice))
-                    ),
-                    DataType::LargeBinary => tracker.push_variable(
-                        as_generic_binary_array::<i64>(array)
-                            .iter()
-                            .map(|slice| variable::encoded_len(slice))
-                    ),
+                    DataType::Binary => push_generic_byte_array_lengths(&mut 
tracker, as_generic_binary_array::<i32>(array)),
+                    DataType::LargeBinary => 
push_generic_byte_array_lengths(&mut tracker, 
as_generic_binary_array::<i64>(array)),
                     DataType::BinaryView => push_byte_view_array_lengths(&mut 
tracker, array.as_binary_view()),
-                    DataType::Utf8 => tracker.push_variable(
-                        array.as_string::<i32>()
-                            .iter()
-                            .map(|slice| variable::encoded_len(slice.map(|x| 
x.as_bytes())))
-                    ),
-                    DataType::LargeUtf8 => tracker.push_variable(
-                        array.as_string::<i64>()
-                            .iter()
-                            .map(|slice| variable::encoded_len(slice.map(|x| 
x.as_bytes())))
-                    ),
+                    DataType::Utf8 => push_generic_byte_array_lengths(&mut 
tracker, array.as_string::<i32>()),
+                    DataType::LargeUtf8 => 
push_generic_byte_array_lengths(&mut tracker, array.as_string::<i64>()),
                     DataType::Utf8View => push_byte_view_array_lengths(&mut 
tracker, array.as_string_view()),
                     DataType::FixedSizeBinary(len) => {
                         let len = len.to_usize().unwrap();
@@ -1656,6 +1640,30 @@ fn row_lengths(cols: &[ArrayRef], encoders: &[Encoder]) 
-> LengthTracker {
     tracker
 }
 
+/// Add to [`LengthTracker`] the encoded length of each item in the 
[`GenericByteArray`]
+fn push_generic_byte_array_lengths<T: ByteArrayType>(
+    tracker: &mut LengthTracker,
+    array: &GenericByteArray<T>,
+) {
+    if let Some(nulls) = array.nulls().filter(|n| n.null_count() > 0) {
+        tracker.push_variable(
+            array
+                .offsets()
+                .lengths()
+                .zip(nulls.iter())
+                .map(|(length, is_valid)| if is_valid { Some(length) } else { 
None })
+                .map(variable::padded_length),
+        )
+    } else {
+        tracker.push_variable(
+            array
+                .offsets()
+                .lengths()
+                .map(variable::non_null_padded_length),
+        )
+    }
+}
+
 /// Add to [`LengthTracker`] the encoded length of each item in the 
[`GenericByteViewArray`]
 fn push_byte_view_array_lengths<T: ByteViewType>(
     tracker: &mut LengthTracker,
diff --git a/arrow-row/src/variable.rs b/arrow-row/src/variable.rs
index 73e19b197f..1c1df00528 100644
--- a/arrow-row/src/variable.rs
+++ b/arrow-row/src/variable.rs
@@ -55,11 +55,20 @@ pub fn encoded_len(a: Option<&[u8]>) -> usize {
 #[inline]
 pub fn padded_length(a: Option<usize>) -> usize {
     match a {
-        Some(a) if a <= BLOCK_SIZE => 1 + ceil(a, MINI_BLOCK_SIZE) * 
(MINI_BLOCK_SIZE + 1),
+        Some(a) => non_null_padded_length(a),
+        None => 1,
+    }
+}
+
+/// Returns the padded length of the encoded length of the given length
+#[inline]
+pub(crate) fn non_null_padded_length(len: usize) -> usize {
+    if len <= BLOCK_SIZE {
+        1 + ceil(len, MINI_BLOCK_SIZE) * (MINI_BLOCK_SIZE + 1)
+    } else {
         // Each miniblock ends with a 1 byte continuation, therefore add
         // `(MINI_BLOCK_COUNT - 1)` additional bytes over non-miniblock size
-        Some(a) => MINI_BLOCK_COUNT + ceil(a, BLOCK_SIZE) * (BLOCK_SIZE + 1),
-        None => 1,
+        MINI_BLOCK_COUNT + ceil(len, BLOCK_SIZE) * (BLOCK_SIZE + 1)
     }
 }
 

Reply via email to