This is an automated email from the ASF dual-hosted git repository.
alamb pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git
The following commit(s) were added to refs/heads/main by this push:
new 9213ffd035 perf: improve performance of encoding `GenericByteArray` by
8% (#9054)
9213ffd035 is described below
commit 9213ffd035f32b657965096bf5781d1ca1d5cf67
Author: Raz Luvaton <[email protected]>
AuthorDate: Tue Dec 30 14:36:17 2025 +0200
perf: improve performance of encoding `GenericByteArray` by 8% (#9054)
# Which issue does this PR close?
N/A
# Rationale for this change
Make row conversion faster for `GenericByteArray` columns (Binary, LargeBinary, Utf8 and LargeUtf8).
# What changes are included in this PR?
Created a "manual" iterator over the byte array's values and offsets, with
a fast path that skips null checks when the array contains no nulls.
# Are these changes tested?
Existing tests
# Are there any user-facing changes?
No
---
arrow-row/src/lib.rs | 14 ++++++--------
arrow-row/src/variable.rs | 46 +++++++++++++++++++++++++++++++++++++++++++++-
2 files changed, 51 insertions(+), 9 deletions(-)
diff --git a/arrow-row/src/lib.rs b/arrow-row/src/lib.rs
index aa6543485f..3ffa71e98c 100644
--- a/arrow-row/src/lib.rs
+++ b/arrow-row/src/lib.rs
@@ -1644,24 +1644,22 @@ fn encode_column(
}
}
DataType::Binary => {
- variable::encode(data, offsets,
as_generic_binary_array::<i32>(column).iter(), opts)
+ variable::encode_generic_byte_array(data, offsets,
as_generic_binary_array::<i32>(column), opts)
}
DataType::BinaryView => {
variable::encode(data, offsets,
column.as_binary_view().iter(), opts)
}
DataType::LargeBinary => {
- variable::encode(data, offsets,
as_generic_binary_array::<i64>(column).iter(), opts)
+ variable::encode_generic_byte_array(data, offsets,
as_generic_binary_array::<i64>(column), opts)
}
- DataType::Utf8 => variable::encode(
+ DataType::Utf8 => variable::encode_generic_byte_array(
data, offsets,
- column.as_string::<i32>().iter().map(|x| x.map(|x|
x.as_bytes())),
+ column.as_string::<i32>(),
opts,
),
- DataType::LargeUtf8 => variable::encode(
+ DataType::LargeUtf8 => variable::encode_generic_byte_array(
data, offsets,
- column.as_string::<i64>()
- .iter()
- .map(|x| x.map(|x| x.as_bytes())),
+ column.as_string::<i64>(),
opts,
),
DataType::Utf8View => variable::encode(
diff --git a/arrow-row/src/variable.rs b/arrow-row/src/variable.rs
index ac2c4cb97c..73e19b197f 100644
--- a/arrow-row/src/variable.rs
+++ b/arrow-row/src/variable.rs
@@ -17,9 +17,10 @@
use crate::null_sentinel;
use arrow_array::builder::BufferBuilder;
+use arrow_array::types::ByteArrayType;
use arrow_array::*;
-use arrow_buffer::MutableBuffer;
use arrow_buffer::bit_util::ceil;
+use arrow_buffer::{ArrowNativeType, MutableBuffer};
use arrow_data::{ArrayDataBuilder, MAX_INLINE_VIEW_LEN};
use arrow_schema::{DataType, SortOptions};
use builder::make_view;
@@ -84,6 +85,48 @@ pub fn encode<'a, I: Iterator<Item = Option<&'a [u8]>>>(
}
}
+/// Calls [`encode`] with optimized iterator for generic byte arrays
+pub(crate) fn encode_generic_byte_array<T: ByteArrayType>(
+ data: &mut [u8],
+ offsets: &mut [usize],
+ input_array: &GenericByteArray<T>,
+ opts: SortOptions,
+) {
+ let input_offsets = input_array.value_offsets();
+ let bytes = input_array.values().as_slice();
+
+ if let Some(null_buffer) = input_array.nulls().filter(|x| x.null_count() >
0) {
+ let input_iter =
+ input_offsets
+ .windows(2)
+ .zip(null_buffer.iter())
+ .map(|(start_end, is_valid)| {
+ if is_valid {
+ let item_range =
start_end[0].as_usize()..start_end[1].as_usize();
+ // SAFETY: the offsets of the input are valid by
construction
+ // so it is ok to use unsafe here
+ let item = unsafe { bytes.get_unchecked(item_range) };
+ Some(item)
+ } else {
+ None
+ }
+ });
+
+ encode(data, offsets, input_iter, opts);
+ } else {
+ // Skip null checks
+ let input_iter = input_offsets.windows(2).map(|start_end| {
+ let item_range = start_end[0].as_usize()..start_end[1].as_usize();
+ // SAFETY: the offsets of the input are valid by construction
+ // so it is ok to use unsafe here
+ let item = unsafe { bytes.get_unchecked(item_range) };
+ Some(item)
+ });
+
+ encode(data, offsets, input_iter, opts);
+ }
+}
+
pub fn encode_null(out: &mut [u8], opts: SortOptions) -> usize {
out[0] = null_sentinel(opts);
1
@@ -97,6 +140,7 @@ pub fn encode_empty(out: &mut [u8], opts: SortOptions) ->
usize {
1
}
+#[inline]
pub fn encode_one(out: &mut [u8], val: Option<&[u8]>, opts: SortOptions) ->
usize {
match val {
None => encode_null(out, opts),