This is an automated email from the ASF dual-hosted git repository.
jeffreyvo pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git
The following commit(s) were added to refs/heads/main by this push:
new addf74d72d Improve `RunArray` documentation (#9019)
addf74d72d is described below
commit addf74d72d88684dabac21822c22f35e803135c5
Author: Jeffrey Vo <[email protected]>
AuthorDate: Sun Dec 21 23:08:21 2025 +0900
Improve `RunArray` documentation (#9019)
# Which issue does this PR close?
<!--
We generally require a GitHub issue to be filed for all bug fixes and
enhancements and this helps us generate change logs for our releases.
You can link an issue to this PR using the GitHub syntax.
-->
N/A
# Rationale for this change
<!--
Why are you proposing this change? If this is already explained clearly
in the issue then this section is not needed.
Explaining clearly why changes are proposed helps reviewers understand
your changes and offer better suggestions for fixes.
-->
Whilst reviewing https://github.com/apache/datafusion/pull/18981 I found
it very confusing trying to follow along the logic for accounting for
slicing of RunArrays. Decided to see if I can improve the documentation
around it, especially to make it clear slicing acts logically not
physically.
# What changes are included in this PR?
<!--
There is no need to duplicate the description in the issue here but it
is sometimes worth providing a summary of the individual changes in this
PR.
-->
Improve docstrings for RunArrays and RunEndBuffers.
Also add some tests for kernel operations on sliced RunArrays which
seemed to be missing.
# Are these changes tested?
<!--
We typically require tests for all PRs in order to:
1. Prevent the code from being accidentally broken by subsequent changes
2. Serve as another way to document the expected behavior of the code
If tests are not included in your PR, please explain why (for example,
are they covered by existing tests)?
-->
Doc changes only. Added some (expected) failing tests to showcase #9018
too.
# Are there any user-facing changes?
<!--
If there are user-facing changes then we may require documentation to be
updated before approving the PR.
If there are any breaking changes to public APIs, please call them out.
-->
Doc changes only.
---------
Co-authored-by: Andrew Lamb <[email protected]>
---
arrow-array/src/array/run_array.rs | 74 +++++++++-----
arrow-buffer/src/buffer/run.rs | 198 ++++++++++++++++++++++++-------------
arrow-cast/src/cast/mod.rs | 17 ++++
arrow-select/src/concat.rs | 25 +++++
arrow-select/src/filter.rs | 20 ++++
arrow-select/src/interleave.rs | 26 +++++
arrow-select/src/take.rs | 21 ++++
7 files changed, 285 insertions(+), 96 deletions(-)
diff --git a/arrow-array/src/array/run_array.rs
b/arrow-array/src/array/run_array.rs
index b0365bd6cd..ed68c10f61 100644
--- a/arrow-array/src/array/run_array.rs
+++ b/arrow-array/src/array/run_array.rs
@@ -30,16 +30,15 @@ use crate::{
types::{Int16Type, Int32Type, Int64Type, RunEndIndexType},
};
-/// An array of [run-end encoded
values](https://arrow.apache.org/docs/format/Columnar.html#run-end-encoded-layout)
+/// An array of [run-end encoded values].
///
-/// This encoding is variation on [run-length encoding
(RLE)](https://en.wikipedia.org/wiki/Run-length_encoding)
-/// and is good for representing data containing same values repeated
consecutively.
-///
-/// [`RunArray`] contains `run_ends` array and `values` array of same length.
-/// The `run_ends` array stores the indexes at which the run ends. The
`values` array
-/// stores the value of each run. Below example illustrates how a logical
array is represented in
-/// [`RunArray`]
+/// This encoding is variation on [run-length encoding (RLE)] and is good for
representing
+/// data containing the same values repeated consecutively.
///
+/// A [`RunArray`] consists of a `run_ends` buffer and a `values` array of
equivalent
+/// lengths. The `run_ends` buffer stores the indexes at which the run ends.
The
+/// `values` array stores the corresponding value of each run. The below
example
+/// illustrates how a logical array is represented by a [`RunArray`]:
///
/// ```text
/// ┌ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─┐
@@ -60,6 +59,9 @@ use crate::{
/// Logical array
/// Contents
/// ```
+///
+/// [run-end encoded values]:
https://arrow.apache.org/docs/format/Columnar.html#run-end-encoded-layout
+/// [run-length encoding (RLE)]:
https://en.wikipedia.org/wiki/Run-length_encoding
pub struct RunArray<R: RunEndIndexType> {
data_type: DataType,
run_ends: RunEndBuffer<R::Native>,
@@ -77,8 +79,8 @@ impl<R: RunEndIndexType> Clone for RunArray<R> {
}
impl<R: RunEndIndexType> RunArray<R> {
- /// Calculates the logical length of the array encoded
- /// by the given run_ends array.
+ /// Calculates the logical length of the array encoded by treating the
`run_ends`
+ /// array as if it were a [`RunEndBuffer`].
pub fn logical_len(run_ends: &PrimitiveArray<R>) -> usize {
let len = run_ends.len();
if len == 0 {
@@ -87,9 +89,13 @@ impl<R: RunEndIndexType> RunArray<R> {
run_ends.value(len - 1).as_usize()
}
- /// Attempts to create RunArray using given run_ends (index where a run
ends)
- /// and the values (value of the run). Returns an error if the given data
is not compatible
- /// with RunEndEncoded specification.
+ /// Attempts to create a [`RunArray`] using the given `run_ends` and
`values`.
+ ///
+ /// # Errors
+ ///
+ /// - If `run_ends` and `values` have different lengths
+ /// - If `run_ends` has any null values
+ /// - If `run_ends` doesn't consist of strictly increasing positive
integers
pub fn try_new(run_ends: &PrimitiveArray<R>, values: &dyn Array) ->
Result<Self, ArrowError> {
let run_ends_type = run_ends.data_type().clone();
let values_type = values.data_type().clone();
@@ -117,25 +123,29 @@ impl<R: RunEndIndexType> RunArray<R> {
Ok(array_data.into())
}
- /// Returns a reference to [`RunEndBuffer`]
+ /// Returns a reference to the [`RunEndBuffer`].
pub fn run_ends(&self) -> &RunEndBuffer<R::Native> {
&self.run_ends
}
- /// Returns a reference to values array
+ /// Returns a reference to the values array.
///
- /// Note: any slicing of this [`RunArray`] array is not applied to the
returned array
- /// and must be handled separately
+ /// Any slicing of this [`RunArray`] array is **not** applied to the
returned
+ /// values here and must be handled separately.
pub fn values(&self) -> &ArrayRef {
&self.values
}
/// Returns the physical index at which the array slice starts.
+ ///
+ /// See [`RunEndBuffer::get_start_physical_index`].
pub fn get_start_physical_index(&self) -> usize {
self.run_ends.get_start_physical_index()
}
/// Returns the physical index at which the array slice ends.
+ ///
+ /// See [`RunEndBuffer::get_end_physical_index`].
pub fn get_end_physical_index(&self) -> usize {
self.run_ends.get_end_physical_index()
}
@@ -152,7 +162,6 @@ impl<R: RunEndIndexType> RunArray<R> {
/// assert_eq!(typed.value(1), "b");
/// assert!(typed.values().is_null(2));
/// ```
- ///
pub fn downcast<V: 'static>(&self) -> Option<TypedRunArray<'_, R, V>> {
let values = self.values.as_any().downcast_ref()?;
Some(TypedRunArray {
@@ -161,22 +170,31 @@ impl<R: RunEndIndexType> RunArray<R> {
})
}
- /// Returns index to the physical array for the given index to the logical
array.
- /// This function adjusts the input logical index based on
`ArrayData::offset`
- /// Performs a binary search on the run_ends array for the input index.
+ /// Calls [`RunEndBuffer::get_physical_index`].
///
/// The result is arbitrary if `logical_index >= self.len()`
pub fn get_physical_index(&self, logical_index: usize) -> usize {
self.run_ends.get_physical_index(logical_index)
}
- /// Returns the physical indices of the input logical indices. Returns
error if any of the logical
- /// index cannot be converted to physical index. The logical indices are
sorted and iterated along
- /// with run_ends array to find matching physical index. The approach used
here was chosen over
- /// finding physical index for each logical index using binary search
using the function
- /// `get_physical_index`. Running benchmarks on both approaches showed
that the approach used here
+ /// Given the input `logical_indices`, return the corresponding physical
index
+ /// for each, according to the underlying [`RunEndBuffer`], taking into
account
+ /// any slicing that has occurred.
+ ///
+ /// Returns an error if any of the provided logical indices is out of
range.
+ ///
+ /// # Implementation
+ ///
+ /// The logical indices are sorted and iterated along with the `run_ends`
buffer
+ /// to find the matching physical index. The approach used here was chosen
over
+ /// finding the physical index for each logical index using binary search
via
+ /// the function [`RunEndBuffer::get_physical_index`].
+ ///
+ /// Running benchmarks on both approaches showed that the approach used
here
/// scaled well for larger inputs.
+ ///
/// See
<https://github.com/apache/arrow-rs/pull/3622#issuecomment-1407753727> for more
details.
+ // TODO: this technically should be a method on RunEndBuffer
#[inline]
pub fn get_physical_indices<I>(&self, logical_indices: &[I]) ->
Result<Vec<usize>, ArrowError>
where
@@ -244,6 +262,10 @@ impl<R: RunEndIndexType> RunArray<R> {
}
/// Returns a zero-copy slice of this array with the indicated offset and
length.
+ ///
+ /// # Panics
+ ///
+ /// - Specified slice (`offset` + `length`) exceeds existing length
pub fn slice(&self, offset: usize, length: usize) -> Self {
Self {
data_type: self.data_type.clone(),
diff --git a/arrow-buffer/src/buffer/run.rs b/arrow-buffer/src/buffer/run.rs
index 61fff6692e..8cb0f8ba4f 100644
--- a/arrow-buffer/src/buffer/run.rs
+++ b/arrow-buffer/src/buffer/run.rs
@@ -18,75 +18,108 @@
use crate::ArrowNativeType;
use crate::buffer::ScalarBuffer;
-/// A slice-able buffer of monotonically increasing, positive integers used to
store run-ends
+/// A buffer of monotonically increasing, positive integers used to store
run-ends.
///
-/// # Logical vs Physical
-///
-/// A [`RunEndBuffer`] is used to encode runs of the same value, the index of
each run is
-/// called the physical index. The logical index is then the corresponding
index in the logical
-/// run-encoded array, i.e. a single run of length `3`, would have the logical
indices `0..3`.
+/// Used to compactly represent runs of the same value. Values being
represented
+/// are stored in a separate buffer from this struct. See [`RunArray`] for an
example
+/// of how this is used with a companion array to represent the values.
///
-/// Each value in [`RunEndBuffer::values`] is the cumulative length of all
runs in the
-/// logical array, up to that physical index.
+/// # Logical vs Physical
///
-/// Consider a [`RunEndBuffer`] containing `[3, 4, 6]`. The maximum physical
index is `2`,
-/// as there are `3` values, and the maximum logical index is `5`, as the
maximum run end
-/// is `6`. The physical indices are therefore `[0, 0, 0, 1, 2, 2]`
+/// Physically, each value in the `run_ends` buffer is the cumulative length of
+/// all runs in the logical representation, up to that physical index. Consider
+/// the following example:
///
/// ```text
-/// ┌─────────┐ ┌─────────┐ ┌─────────┐
-/// │ 3 │ │ 0 │ ─┬──────▶ │ 0 │
-/// ├─────────┤ ├─────────┤ │ ├─────────┤
-/// │ 4 │ │ 1 │ ─┤ ┌────▶ │ 1 │
-/// ├─────────┤ ├─────────┤ │ │ ├─────────┤
-/// │ 6 │ │ 2 │ ─┘ │ ┌──▶ │ 2 │
-/// └─────────┘ ├─────────┤ │ │ └─────────┘
-/// run ends │ 3 │ ───┘ │ physical indices
-/// ├─────────┤ │
-/// │ 4 │ ─────┤
-/// ├─────────┤ │
-/// │ 5 │ ─────┘
-/// └─────────┘
-/// logical indices
+/// physical logical
+/// ┌─────────┬─────────┐ ┌─────────┬─────────┐
+/// │ 3 │ 0 │ ◄──────┬─ │ A │ 0 │
+/// ├─────────┼─────────┤ │ ├─────────┼─────────┤
+/// │ 4 │ 1 │ ◄────┐ ├─ │ A │ 1 │
+/// ├─────────┼─────────┤ │ │ ├─────────┼─────────┤
+/// │ 6 │ 2 │ ◄──┐ │ └─ │ A │ 2 │
+/// └─────────┴─────────┘ │ │ ├─────────┼─────────┤
+/// run-ends index │ └─── │ B │ 3 │
+/// │ ├─────────┼─────────┤
+/// logical_offset = 0 ├───── │ C │ 4 │
+/// logical_length = 6 │ ├─────────┼─────────┤
+/// └───── │ C │ 5 │
+/// └─────────┴─────────┘
+/// values index
/// ```
///
+/// A [`RunEndBuffer`] is physically the buffer and offset with length on the
left.
+/// In this case, the offset and length represent the whole buffer, so it is
essentially
+/// unsliced. See the section below on slicing for more details on how this
buffer
+/// handles slicing.
+///
+/// This means that multiple logical values are represented in the same
physical index,
+/// and multiple logical indices map to the same physical index. The
[`RunEndBuffer`]
+/// containing `[3, 4, 6]` is essentially the physical indices `[0, 0, 0, 1,
2, 2]`,
+/// and having a separately stored buffer of values such as `[A, B, C]` can
turn
+/// this into a representation of `[A, A, A, B, C, C]`.
+///
/// # Slicing
///
-/// In order to provide zero-copy slicing, this container stores a separate
offset and length
+/// In order to provide zero-copy slicing, this struct stores a separate
**logical**
+/// offset and length. Consider the following example:
+///
+/// ```text
+/// physical logical
+/// ┌─────────┬─────────┐ ┌ ─ ─ ─ ─ ┬ ─ ─ ─ ─ ┐
+/// │ 3 │ 0 │ ◄──────┐ A 0
+/// ├─────────┼─────────┤ │ ├── ─ ─ ─ ┼ ─ ─ ─ ─ ┤
+/// │ 4 │ 1 │ ◄────┐ │ A 1
+/// ├─────────┼─────────┤ │ │ ├─────────┼─────────┤
+/// │ 6 │ 2 │ ◄──┐ │ └─ │ A │ 2 │◄───
logical_offset
+/// └─────────┴─────────┘ │ │ ├─────────┼─────────┤
+/// run-ends index │ └─── │ B │ 3 │
+/// │ ├─────────┼─────────┤
+/// logical_offset = 2 └───── │ C │ 4 │
+/// logical_length = 3 ├─────────┼─────────┤
+/// C 5 ◄───
logical_offset + logical_length
+/// └ ─ ─ ─ ─ ┴ ─ ─ ─ ─ ┘
+/// values index
+/// ```
///
-/// For example, a [`RunEndBuffer`] containing values `[3, 6, 8]` with offset
and length `4` would
-/// describe the physical indices `1, 1, 2, 2`
+/// The physical `run_ends` [`ScalarBuffer`] remains unchanged, in order to
facilitate
+/// zero-copy. However, we now offset into the **logical** representation with
an
+/// accompanying length. This allows us to represent values `[A, B, C]` using
physical
+/// indices `0, 1, 2` with the same underlying physical buffer, at the cost of
two
+/// extra `usize`s to represent the logical slice that was taken.
///
-/// For example, a [`RunEndBuffer`] containing values `[6, 8, 9]` with offset
`2` and length `5`
-/// would describe the physical indices `0, 0, 0, 0, 1`
+/// (A [`RunEndBuffer`] is considered unsliced when `logical_offset` is `0` and
+/// `logical_length` is equal to the last value in `run_ends`)
///
+/// [`RunArray`]: https://docs.rs/arrow/latest/arrow/array/struct.RunArray.html
/// [Run-End encoded layout]:
https://arrow.apache.org/docs/format/Columnar.html#run-end-encoded-layout
#[derive(Debug, Clone)]
pub struct RunEndBuffer<E: ArrowNativeType> {
run_ends: ScalarBuffer<E>,
- len: usize,
- offset: usize,
+ logical_length: usize,
+ logical_offset: usize,
}
impl<E> RunEndBuffer<E>
where
E: ArrowNativeType,
{
- /// Create a new [`RunEndBuffer`] from a [`ScalarBuffer`], an `offset` and
`len`
+ /// Create a new [`RunEndBuffer`] from a [`ScalarBuffer`], `logical_offset`
+ /// and `logical_length`.
///
/// # Panics
///
- /// - `buffer` does not contain strictly increasing values greater than
zero
- /// - the last value of `buffer` is less than `offset + len`
- pub fn new(run_ends: ScalarBuffer<E>, offset: usize, len: usize) -> Self {
+ /// - `run_ends` does not contain strictly increasing values greater than
zero
+ /// - The last value of `run_ends` is less than `logical_offset +
logical_length`
+ pub fn new(run_ends: ScalarBuffer<E>, logical_offset: usize,
logical_length: usize) -> Self {
assert!(
run_ends.windows(2).all(|w| w[0] < w[1]),
"run-ends not strictly increasing"
);
- if len != 0 {
+ if logical_length != 0 {
assert!(!run_ends.is_empty(), "non-empty slice but empty
run-ends");
- let end = E::from_usize(offset.saturating_add(len)).unwrap();
+ let end =
E::from_usize(logical_offset.saturating_add(logical_length)).unwrap();
assert!(
*run_ends.first().unwrap() > E::usize_as(0),
"run-ends not greater than 0"
@@ -99,41 +132,46 @@ where
Self {
run_ends,
- offset,
- len,
+ logical_offset,
+ logical_length,
}
}
- /// Create a new [`RunEndBuffer`] from an [`ScalarBuffer`], an `offset`
and `len`
+ /// Create a new [`RunEndBuffer`] from a [`ScalarBuffer`], `logical_offset`
+ /// and `logical_length`.
///
/// # Safety
///
- /// - `buffer` must contain strictly increasing values greater than zero
- /// - The last value of `buffer` must be greater than or equal to `offset
+ len`
- pub unsafe fn new_unchecked(run_ends: ScalarBuffer<E>, offset: usize, len:
usize) -> Self {
+ /// - `run_ends` must contain strictly increasing values greater than zero
+ /// - The last value of `run_ends` must be greater than or equal to
`logical_offset + logical_len`
+ pub unsafe fn new_unchecked(
+ run_ends: ScalarBuffer<E>,
+ logical_offset: usize,
+ logical_length: usize,
+ ) -> Self {
Self {
run_ends,
- offset,
- len,
+ logical_offset,
+ logical_length,
}
}
- /// Returns the logical offset into the run-ends stored by this buffer
+ /// Returns the logical offset into the run-ends stored by this buffer.
#[inline]
pub fn offset(&self) -> usize {
- self.offset
+ self.logical_offset
}
- /// Returns the logical length of the run-ends stored by this buffer
+ /// Returns the logical length of the run-ends stored by this buffer.
#[inline]
pub fn len(&self) -> usize {
- self.len
+ self.logical_length
}
- /// Returns true if this buffer is empty
+ /// Returns true if this buffer is logically empty.
#[inline]
pub fn is_empty(&self) -> bool {
- self.len == 0
+ self.logical_length == 0
}
/// Free up unused memory.
@@ -142,23 +180,32 @@ where
self.run_ends.shrink_to_fit();
}
- /// Returns the values of this [`RunEndBuffer`] not including any offset
+ /// Returns the physical (**unsliced**) run ends of this buffer.
+ ///
+ /// Take care when operating on these values as it doesn't take into
account
+ /// any logical slicing that may have occurred.
#[inline]
pub fn values(&self) -> &[E] {
&self.run_ends
}
- /// Returns the maximum run-end encoded in the underlying buffer
+ /// Returns the maximum run-end encoded in the underlying buffer; that is,
the
+ /// last physical run of the buffer. This does not take into account any
logical
+ /// slicing that may have occurred.
#[inline]
pub fn max_value(&self) -> usize {
self.values().last().copied().unwrap_or_default().as_usize()
}
- /// Performs a binary search to find the physical index for the given
logical index
+ /// Performs a binary search to find the physical index for the given
logical
+ /// index.
+ ///
+ /// Useful for extracting the corresponding physical `run_ends` when this
buffer
+ /// is logically sliced.
///
- /// The result is arbitrary if `logical_index >= self.len()`
+ /// The result is arbitrary if `logical_index >= self.len()`.
pub fn get_physical_index(&self, logical_index: usize) -> usize {
- let logical_index = E::usize_as(self.offset + logical_index);
+ let logical_index = E::usize_as(self.logical_offset + logical_index);
let cmp = |p: &E| p.partial_cmp(&logical_index).unwrap();
match self.run_ends.binary_search_by(cmp) {
@@ -167,46 +214,57 @@ where
}
}
- /// Returns the physical index at which the logical array starts
+ /// Returns the physical index at which the logical array starts.
+ ///
+ /// The same as calling `get_physical_index(0)` but with a fast path if the
+ /// buffer is not logically sliced, in which case it always returns `0`.
pub fn get_start_physical_index(&self) -> usize {
- if self.offset == 0 || self.len == 0 {
+ if self.logical_offset == 0 || self.logical_length == 0 {
return 0;
}
// Fallback to binary search
self.get_physical_index(0)
}
- /// Returns the physical index at which the logical array ends
+ /// Returns the physical index at which the logical array ends.
+ ///
+ /// The same as calling `get_physical_index(length - 1)` but with a fast
path
+ /// if the buffer is not logically sliced, in which case it returns
`length - 1`.
pub fn get_end_physical_index(&self) -> usize {
- if self.len == 0 {
+ if self.logical_length == 0 {
return 0;
}
- if self.max_value() == self.offset + self.len {
+ if self.max_value() == self.logical_offset + self.logical_length {
return self.values().len() - 1;
}
// Fallback to binary search
- self.get_physical_index(self.len - 1)
+ self.get_physical_index(self.logical_length - 1)
}
- /// Slices this [`RunEndBuffer`] by the provided `offset` and `length`
- pub fn slice(&self, offset: usize, len: usize) -> Self {
+ /// Slices this [`RunEndBuffer`] by the provided `logical_offset` and
`logical_length`.
+ ///
+ /// # Panics
+ ///
+ /// - Specified slice (`logical_offset` + `logical_length`) exceeds
existing
+ /// logical length
+ pub fn slice(&self, logical_offset: usize, logical_length: usize) -> Self {
assert!(
- offset.saturating_add(len) <= self.len,
+ logical_offset.saturating_add(logical_length) <=
self.logical_length,
"the length + offset of the sliced RunEndBuffer cannot exceed the
existing length"
);
Self {
run_ends: self.run_ends.clone(),
- offset: self.offset + offset,
- len,
+ logical_offset: self.logical_offset + logical_offset,
+ logical_length,
}
}
- /// Returns the inner [`ScalarBuffer`]
+ /// Returns the inner [`ScalarBuffer`].
pub fn inner(&self) -> &ScalarBuffer<E> {
&self.run_ends
}
- /// Returns the inner [`ScalarBuffer`], consuming self
+ /// Returns the inner [`ScalarBuffer`], consuming self.
pub fn into_inner(self) -> ScalarBuffer<E> {
self.run_ends
}
diff --git a/arrow-cast/src/cast/mod.rs b/arrow-cast/src/cast/mod.rs
index ac6795a3c6..67fd02837d 100644
--- a/arrow-cast/src/cast/mod.rs
+++ b/arrow-cast/src/cast/mod.rs
@@ -11745,6 +11745,23 @@ mod tests {
);
}
+ #[test]
+ #[should_panic = "assertion `left == right` failed\n left:
ScalarBuffer([1, 1, 2])\n right: [2, 2, 3]"]
+ // TODO: fix cast of RunArrays to account for sliced RunArray's
+ // https://github.com/apache/arrow-rs/issues/9018
+ fn test_sliced_run_end_encoded_to_primitive() {
+ let run_ends = Int32Array::from(vec![2, 5, 6]);
+ let values = Int32Array::from(vec![1, 2, 3]);
+ // [1, 1, 2, 2, 2, 3]
+ let run_array = RunArray::<Int32Type>::try_new(&run_ends,
&values).unwrap();
+ let run_array = run_array.slice(3, 3); // [2, 2, 3]
+ let array_ref = Arc::new(run_array) as ArrayRef;
+
+ let cast_result = cast(&array_ref, &DataType::Int64).unwrap();
+ let result_run_array = cast_result.as_primitive::<Int64Type>();
+ assert_eq!(result_run_array.values(), &[2, 2, 3]);
+ }
+
#[test]
fn test_run_end_encoded_to_string() {
let run_ends = Int32Array::from(vec![2, 3, 5]);
diff --git a/arrow-select/src/concat.rs b/arrow-select/src/concat.rs
index 81b24827e3..84c41b6e16 100644
--- a/arrow-select/src/concat.rs
+++ b/arrow-select/src/concat.rs
@@ -1715,6 +1715,31 @@ mod tests {
assert_eq!(&[10, 20, 30, 40], values.values());
}
+ #[test]
+ #[should_panic = "assertion `left == right` failed\n left: [20, 20, 40,
40, 40]\n right: [10, 10, 20, 20, 30, 40, 40, 40]"]
+ // TODO: fix concat of RunArrays to account for sliced RunArray's
+ // https://github.com/apache/arrow-rs/issues/9018
+ fn test_concat_sliced_run_array() {
+ // Slicing away first run in both arrays
+ let run_ends1 = Int32Array::from(vec![2, 4]);
+ let values1 = Int32Array::from(vec![10, 20]);
+ let array1 = RunArray::try_new(&run_ends1, &values1).unwrap(); // [10,
10, 20, 20]
+ let array1 = array1.slice(2, 2); // [20, 20]
+
+ let run_ends2 = Int32Array::from(vec![1, 4]);
+ let values2 = Int32Array::from(vec![30, 40]);
+ let array2 = RunArray::try_new(&run_ends2, &values2).unwrap(); // [30,
40, 40, 40]
+ let array2 = array2.slice(1, 3); // [40, 40, 40]
+
+ let result = concat(&[&array1, &array2]).unwrap();
+ let result = result.as_run::<Int32Type>();
+ let result = result.downcast::<Int32Array>().unwrap();
+
+ let expected = vec![20, 20, 40, 40, 40];
+ let actual = result.into_iter().flatten().collect::<Vec<_>>();
+ assert_eq!(expected, actual);
+ }
+
#[test]
fn test_concat_run_array_matching_first_last_value() {
// Create a run array with run ends [2, 4, 7] and values [10, 20, 30]
diff --git a/arrow-select/src/filter.rs b/arrow-select/src/filter.rs
index 6153845e8c..1aa933ae18 100644
--- a/arrow-select/src/filter.rs
+++ b/arrow-select/src/filter.rs
@@ -1354,6 +1354,26 @@ mod tests {
assert_eq!(actual.values(), expected.values())
}
+ #[test]
+ #[should_panic = "assertion `left == right` failed\n left: [-2, 9]\n
right: [7, -2]"]
+ // TODO: fix filter of RunArrays to account for sliced RunArray's
+ // https://github.com/apache/arrow-rs/issues/9018
+ fn test_filter_run_end_encoding_array_sliced() {
+ let run_ends = Int64Array::from(vec![2, 3, 8]);
+ let values = Int64Array::from(vec![7, -2, 9]);
+ let a = RunArray::try_new(&run_ends, &values).unwrap(); // [7, 7, -2,
9, 9, 9, 9, 9]
+ let a = a.slice(2, 3); // [-2, 9, 9]
+ let b = BooleanArray::from(vec![true, false, true]);
+ let result = filter(&a, &b).unwrap();
+
+ let result = result.as_run::<Int64Type>();
+ let result = result.downcast::<Int64Array>().unwrap();
+
+ let expected = vec![-2, 9];
+ let actual = result.into_iter().flatten().collect::<Vec<_>>();
+ assert_eq!(expected, actual);
+ }
+
#[test]
fn test_filter_run_end_encoding_array_remove_value() {
let run_ends = Int32Array::from(vec![2, 3, 8, 10]);
diff --git a/arrow-select/src/interleave.rs b/arrow-select/src/interleave.rs
index f870bc8c2f..7a04c83f79 100644
--- a/arrow-select/src/interleave.rs
+++ b/arrow-select/src/interleave.rs
@@ -1171,6 +1171,32 @@ mod tests {
assert_eq!(actual, expected);
}
+ #[test]
+ #[should_panic = "assertion `left == right` failed\n left: [1, 4, 2, 5,
6]\n right: [2, 5, 2, 5, 6]"]
+ // TODO: fix interleave of RunArrays to account for sliced RunArray's
+ // https://github.com/apache/arrow-rs/issues/9018
+ fn test_interleave_run_end_encoded_sliced() {
+ let mut builder = PrimitiveRunBuilder::<Int32Type, Int32Type>::new();
+ builder.extend([1, 1, 2, 2, 2, 3].into_iter().map(Some));
+ let a = builder.finish();
+ let a = a.slice(2, 3); // [2, 2, 2]
+
+ let mut builder = PrimitiveRunBuilder::<Int32Type, Int32Type>::new();
+ builder.extend([4, 5, 5, 6, 6, 6].into_iter().map(Some));
+ let b = builder.finish();
+ let b = b.slice(1, 3); // [5, 5, 6]
+
+ let indices = &[(0, 1), (1, 0), (0, 3), (1, 2), (1, 3)];
+ let result = interleave(&[&a, &b], indices).unwrap();
+
+ let result = result.as_run::<Int32Type>();
+ let result = result.downcast::<Int32Array>().unwrap();
+
+ let expected = vec![2, 5, 2, 5, 6];
+ let actual = result.into_iter().flatten().collect::<Vec<_>>();
+ assert_eq!(actual, expected);
+ }
+
#[test]
fn test_interleave_run_end_encoded_string() {
let a: Int32RunArray = vec!["hello", "hello", "world", "world", "foo"]
diff --git a/arrow-select/src/take.rs b/arrow-select/src/take.rs
index 7459018ea4..7f7791a07a 100644
--- a/arrow-select/src/take.rs
+++ b/arrow-select/src/take.rs
@@ -2438,6 +2438,27 @@ mod tests {
assert_eq!(take_out_values.values(), &[2, 2, 2, 2, 1]);
}
+ #[test]
+ fn test_take_runs_sliced() {
+ let logical_array: Vec<i32> = vec![1, 1, 2, 2, 3, 3, 3, 4, 4, 5, 5, 6,
6];
+
+ let mut builder = PrimitiveRunBuilder::<Int32Type, Int32Type>::new();
+ builder.extend(logical_array.into_iter().map(Some));
+ let run_array = builder.finish();
+
+ let run_array = run_array.slice(4, 6); // [3, 3, 3, 4, 4, 5]
+
+ let take_indices: PrimitiveArray<Int32Type> = vec![0, 5, 5, 1,
4].into_iter().collect();
+
+ let result = take_run(&run_array, &take_indices).unwrap();
+ let result = result.downcast::<Int32Array>().unwrap();
+
+ let expected = vec![3, 5, 5, 3, 4];
+ let actual = result.into_iter().flatten().collect::<Vec<_>>();
+
+ assert_eq!(expected, actual);
+ }
+
#[test]
fn test_take_value_index_from_fixed_list() {
let list = FixedSizeListArray::from_iter_primitive::<Int32Type, _, _>(