This is an automated email from the ASF dual-hosted git repository.

alamb pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git


The following commit(s) were added to refs/heads/main by this push:
     new 9213ffd035 perf: improve performance of encoding `GenericByteArray` by 
8% (#9054)
9213ffd035 is described below

commit 9213ffd035f32b657965096bf5781d1ca1d5cf67
Author: Raz Luvaton <[email protected]>
AuthorDate: Tue Dec 30 14:36:17 2025 +0200

    perf: improve performance of encoding `GenericByteArray` by 8% (#9054)
    
    # Which issue does this PR close?
    
    N/A
    
    # Rationale for this change
    
    Make row conversion faster
    
    # What changes are included in this PR?
    
    Created a "manual" iterator over the byte array's values and offsets, with
    an optimized path that skips null checks when the array has no nulls
    
    # Are these changes tested?
    
    Existing tests
    
    # Are there any user-facing changes?
    
    No
---
 arrow-row/src/lib.rs      | 14 ++++++--------
 arrow-row/src/variable.rs | 46 +++++++++++++++++++++++++++++++++++++++++++++-
 2 files changed, 51 insertions(+), 9 deletions(-)

diff --git a/arrow-row/src/lib.rs b/arrow-row/src/lib.rs
index aa6543485f..3ffa71e98c 100644
--- a/arrow-row/src/lib.rs
+++ b/arrow-row/src/lib.rs
@@ -1644,24 +1644,22 @@ fn encode_column(
                     }
                 }
                 DataType::Binary => {
-                    variable::encode(data, offsets, 
as_generic_binary_array::<i32>(column).iter(), opts)
+                    variable::encode_generic_byte_array(data, offsets, 
as_generic_binary_array::<i32>(column), opts)
                 }
                 DataType::BinaryView => {
                     variable::encode(data, offsets, 
column.as_binary_view().iter(), opts)
                 }
                 DataType::LargeBinary => {
-                    variable::encode(data, offsets, 
as_generic_binary_array::<i64>(column).iter(), opts)
+                    variable::encode_generic_byte_array(data, offsets, 
as_generic_binary_array::<i64>(column), opts)
                 }
-                DataType::Utf8 => variable::encode(
+                DataType::Utf8 => variable::encode_generic_byte_array(
                     data, offsets,
-                    column.as_string::<i32>().iter().map(|x| x.map(|x| 
x.as_bytes())),
+                    column.as_string::<i32>(),
                     opts,
                 ),
-                DataType::LargeUtf8 => variable::encode(
+                DataType::LargeUtf8 => variable::encode_generic_byte_array(
                     data, offsets,
-                    column.as_string::<i64>()
-                        .iter()
-                        .map(|x| x.map(|x| x.as_bytes())),
+                    column.as_string::<i64>(),
                     opts,
                 ),
                 DataType::Utf8View => variable::encode(
diff --git a/arrow-row/src/variable.rs b/arrow-row/src/variable.rs
index ac2c4cb97c..73e19b197f 100644
--- a/arrow-row/src/variable.rs
+++ b/arrow-row/src/variable.rs
@@ -17,9 +17,10 @@
 
 use crate::null_sentinel;
 use arrow_array::builder::BufferBuilder;
+use arrow_array::types::ByteArrayType;
 use arrow_array::*;
-use arrow_buffer::MutableBuffer;
 use arrow_buffer::bit_util::ceil;
+use arrow_buffer::{ArrowNativeType, MutableBuffer};
 use arrow_data::{ArrayDataBuilder, MAX_INLINE_VIEW_LEN};
 use arrow_schema::{DataType, SortOptions};
 use builder::make_view;
@@ -84,6 +85,48 @@ pub fn encode<'a, I: Iterator<Item = Option<&'a [u8]>>>(
     }
 }
 
+/// Calls [`encode`] with optimized iterator for generic byte arrays
+pub(crate) fn encode_generic_byte_array<T: ByteArrayType>(
+    data: &mut [u8],
+    offsets: &mut [usize],
+    input_array: &GenericByteArray<T>,
+    opts: SortOptions,
+) {
+    let input_offsets = input_array.value_offsets();
+    let bytes = input_array.values().as_slice();
+
+    if let Some(null_buffer) = input_array.nulls().filter(|x| x.null_count() > 
0) {
+        let input_iter =
+            input_offsets
+                .windows(2)
+                .zip(null_buffer.iter())
+                .map(|(start_end, is_valid)| {
+                    if is_valid {
+                        let item_range = 
start_end[0].as_usize()..start_end[1].as_usize();
+                        // SAFETY: the offsets of the input are valid by 
construction
+                        // so it is ok to use unsafe here
+                        let item = unsafe { bytes.get_unchecked(item_range) };
+                        Some(item)
+                    } else {
+                        None
+                    }
+                });
+
+        encode(data, offsets, input_iter, opts);
+    } else {
+        // Skip null checks
+        let input_iter = input_offsets.windows(2).map(|start_end| {
+            let item_range = start_end[0].as_usize()..start_end[1].as_usize();
+            // SAFETY: the offsets of the input are valid by construction
+            // so it is ok to use unsafe here
+            let item = unsafe { bytes.get_unchecked(item_range) };
+            Some(item)
+        });
+
+        encode(data, offsets, input_iter, opts);
+    }
+}
+
 pub fn encode_null(out: &mut [u8], opts: SortOptions) -> usize {
     out[0] = null_sentinel(opts);
     1
@@ -97,6 +140,7 @@ pub fn encode_empty(out: &mut [u8], opts: SortOptions) -> 
usize {
     1
 }
 
+#[inline]
 pub fn encode_one(out: &mut [u8], val: Option<&[u8]>, opts: SortOptions) -> 
usize {
     match val {
         None => encode_null(out, opts),

Reply via email to