(arrow-rs) branch main updated: Improve `RunArray` documentation (#9019)

jeffreyvo Sun, 21 Dec 2025 06:08:45 -0800

This is an automated email from the ASF dual-hosted git repository.

jeffreyvo pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git



The following commit(s) were added to refs/heads/main by this push:
     new addf74d72d Improve `RunArray` documentation (#9019)
addf74d72d is described below

commit addf74d72d88684dabac21822c22f35e803135c5
Author: Jeffrey Vo <[email protected]>
AuthorDate: Sun Dec 21 23:08:21 2025 +0900

    Improve `RunArray` documentation (#9019)
    
    # Which issue does this PR close?
    
    <!--
    We generally require a GitHub issue to be filed for all bug fixes and
    enhancements and this helps us generate change logs for our releases.
    You can link an issue to this PR using the GitHub syntax.
    -->
    
    N/A
    
    # Rationale for this change
    
    <!--
    Why are you proposing this change? If this is already explained clearly
    in the issue then this section is not needed.
    Explaining clearly why changes are proposed helps reviewers understand
    your changes and offer better suggestions for fixes.
    -->
    
    Whilst reviewing https://github.com/apache/datafusion/pull/18981 I found
    it very confusing trying to follow along the logic for accounting for
    slicing of RunArrays. Decided to see if I can improve the documentation
    around it, especially to make it clear slicing acts logically not
    physically.
    
    # What changes are included in this PR?
    
    <!--
    There is no need to duplicate the description in the issue here but it
    is sometimes worth providing a summary of the individual changes in this
    PR.
    -->
    
    Improve docstrings for RunArrays and RunEndBuffers.
    
    Also add some tests for kernel operations on sliced RunArrays which
    seemed to be missing.
    
    # Are these changes tested?
    
    <!--
    We typically require tests for all PRs in order to:
    1. Prevent the code from being accidentally broken by subsequent changes
    2. Serve as another way to document the expected behavior of the code
    
    If tests are not included in your PR, please explain why (for example,
    are they covered by existing tests)?
    -->
    
    Doc changes only. Added some (expected) failing tests to showcase #9018
    too.
    
    # Are there any user-facing changes?
    
    <!--
    If there are user-facing changes then we may require documentation to be
    updated before approving the PR.
    
    If there are any breaking changes to public APIs, please call them out.
    -->
    
    Doc changes only.
    
    ---------
    
    Co-authored-by: Andrew Lamb <[email protected]>
---
 arrow-array/src/array/run_array.rs |  74 +++++++++-----
 arrow-buffer/src/buffer/run.rs     | 198 ++++++++++++++++++++++++-------------
 arrow-cast/src/cast/mod.rs         |  17 ++++
 arrow-select/src/concat.rs         |  25 +++++
 arrow-select/src/filter.rs         |  20 ++++
 arrow-select/src/interleave.rs     |  26 +++++
 arrow-select/src/take.rs           |  21 ++++
 7 files changed, 285 insertions(+), 96 deletions(-)

diff --git a/arrow-array/src/array/run_array.rs 
b/arrow-array/src/array/run_array.rs
index b0365bd6cd..ed68c10f61 100644
--- a/arrow-array/src/array/run_array.rs
+++ b/arrow-array/src/array/run_array.rs
@@ -30,16 +30,15 @@ use crate::{
     types::{Int16Type, Int32Type, Int64Type, RunEndIndexType},
 };
 
-/// An array of [run-end encoded 
values](https://arrow.apache.org/docs/format/Columnar.html#run-end-encoded-layout)
+/// An array of [run-end encoded values].
 ///
-/// This encoding is variation on [run-length encoding 
(RLE)](https://en.wikipedia.org/wiki/Run-length_encoding)
-/// and is good for representing data containing same values repeated 
consecutively.
-///
-/// [`RunArray`] contains `run_ends` array and `values` array of same length.
-/// The `run_ends` array stores the indexes at which the run ends. The 
`values` array
-/// stores the value of each run. Below example illustrates how a logical 
array is represented in
-/// [`RunArray`]
+/// This encoding is variation on [run-length encoding (RLE)] and is good for 
representing
+/// data containing the same values repeated consecutively.
 ///
+/// A [`RunArray`] consists of a `run_ends` buffer and a `values` array of 
equivalent
+/// lengths. The `run_ends` buffer stores the indexes at which the run ends. 
The
+/// `values` array stores the corresponding value of each run. The below 
example
+/// illustrates how a logical array is represented by a [`RunArray`]:
 ///
 /// ```text
 /// ┌ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─┐
@@ -60,6 +59,9 @@ use crate::{
 ///                                             Logical array
 ///                                                Contents
 /// ```
+///
+/// [run-end encoded values]: 
https://arrow.apache.org/docs/format/Columnar.html#run-end-encoded-layout
+/// [run-length encoding (RLE)]: 
https://en.wikipedia.org/wiki/Run-length_encoding
 pub struct RunArray<R: RunEndIndexType> {
     data_type: DataType,
     run_ends: RunEndBuffer<R::Native>,
@@ -77,8 +79,8 @@ impl<R: RunEndIndexType> Clone for RunArray<R> {
 }
 
 impl<R: RunEndIndexType> RunArray<R> {
-    /// Calculates the logical length of the array encoded
-    /// by the given run_ends array.
+    /// Calculates the logical length of the array encoded by treating the 
`run_ends`
+    /// array as if it were a [`RunEndBuffer`].
     pub fn logical_len(run_ends: &PrimitiveArray<R>) -> usize {
         let len = run_ends.len();
         if len == 0 {
@@ -87,9 +89,13 @@ impl<R: RunEndIndexType> RunArray<R> {
         run_ends.value(len - 1).as_usize()
     }
 
-    /// Attempts to create RunArray using given run_ends (index where a run 
ends)
-    /// and the values (value of the run). Returns an error if the given data 
is not compatible
-    /// with RunEndEncoded specification.
+    /// Attempts to create a [`RunArray`] using the given `run_ends` and 
`values`.
+    ///
+    /// # Errors
+    ///
+    /// - If `run_ends` and `values` have different lengths
+    /// - If `run_ends` has any null values
+    /// - If `run_ends` doesn't consist of strictly increasing positive 
integers
     pub fn try_new(run_ends: &PrimitiveArray<R>, values: &dyn Array) -> 
Result<Self, ArrowError> {
         let run_ends_type = run_ends.data_type().clone();
         let values_type = values.data_type().clone();
@@ -117,25 +123,29 @@ impl<R: RunEndIndexType> RunArray<R> {
         Ok(array_data.into())
     }
 
-    /// Returns a reference to [`RunEndBuffer`]
+    /// Returns a reference to the [`RunEndBuffer`].
     pub fn run_ends(&self) -> &RunEndBuffer<R::Native> {
         &self.run_ends
     }
 
-    /// Returns a reference to values array
+    /// Returns a reference to the values array.
     ///
-    /// Note: any slicing of this [`RunArray`] array is not applied to the 
returned array
-    /// and must be handled separately
+    /// Any slicing of this [`RunArray`] array is **not** applied to the 
returned
+    /// values here and must be handled separately.
     pub fn values(&self) -> &ArrayRef {
         &self.values
     }
 
     /// Returns the physical index at which the array slice starts.
+    ///
+    /// See [`RunEndBuffer::get_start_physical_index`].
     pub fn get_start_physical_index(&self) -> usize {
         self.run_ends.get_start_physical_index()
     }
 
     /// Returns the physical index at which the array slice ends.
+    ///
+    /// See [`RunEndBuffer::get_end_physical_index`].
     pub fn get_end_physical_index(&self) -> usize {
         self.run_ends.get_end_physical_index()
     }
@@ -152,7 +162,6 @@ impl<R: RunEndIndexType> RunArray<R> {
     /// assert_eq!(typed.value(1), "b");
     /// assert!(typed.values().is_null(2));
     /// ```
-    ///
     pub fn downcast<V: 'static>(&self) -> Option<TypedRunArray<'_, R, V>> {
         let values = self.values.as_any().downcast_ref()?;
         Some(TypedRunArray {
@@ -161,22 +170,31 @@ impl<R: RunEndIndexType> RunArray<R> {
         })
     }
 
-    /// Returns index to the physical array for the given index to the logical 
array.
-    /// This function adjusts the input logical index based on 
`ArrayData::offset`
-    /// Performs a binary search on the run_ends array for the input index.
+    /// Calls [`RunEndBuffer::get_physical_index`].
     ///
     /// The result is arbitrary if `logical_index >= self.len()`
     pub fn get_physical_index(&self, logical_index: usize) -> usize {
         self.run_ends.get_physical_index(logical_index)
     }
 
-    /// Returns the physical indices of the input logical indices. Returns 
error if any of the logical
-    /// index cannot be converted to physical index. The logical indices are 
sorted and iterated along
-    /// with run_ends array to find matching physical index. The approach used 
here was chosen over
-    /// finding physical index for each logical index using binary search 
using the function
-    /// `get_physical_index`. Running benchmarks on both approaches showed 
that the approach used here
+    /// Given the input `logical_indices`, return the corresponding physical 
index
+    /// for each, according to the underlying [`RunEndBuffer`], taking into 
account
+    /// any slicing that has occurred.
+    ///
+    /// Returns an error if any of the provided logical indices is out of 
range.
+    ///
+    /// # Implementation
+    ///
+    /// The logical indices are sorted and iterated along with the `run_ends` 
buffer
+    /// to find the matching physical index. The approach used here was chosen 
over
+    /// finding the physical index for each logical index using binary search 
via
+    /// the function [`RunEndBuffer::get_physical_index`].
+    ///
+    /// Running benchmarks on both approaches showed that the approach used 
here
     /// scaled well for larger inputs.
+    ///
     /// See 
<https://github.com/apache/arrow-rs/pull/3622#issuecomment-1407753727> for more 
details.
+    // TODO: this technically should be a method on RunEndBuffer
     #[inline]
     pub fn get_physical_indices<I>(&self, logical_indices: &[I]) -> 
Result<Vec<usize>, ArrowError>
     where
@@ -244,6 +262,10 @@ impl<R: RunEndIndexType> RunArray<R> {
     }
 
     /// Returns a zero-copy slice of this array with the indicated offset and 
length.
+    ///
+    /// # Panics
+    ///
+    /// - Specified slice (`offset` + `length`) exceeds existing length
     pub fn slice(&self, offset: usize, length: usize) -> Self {
         Self {
             data_type: self.data_type.clone(),
diff --git a/arrow-buffer/src/buffer/run.rs b/arrow-buffer/src/buffer/run.rs
index 61fff6692e..8cb0f8ba4f 100644
--- a/arrow-buffer/src/buffer/run.rs
+++ b/arrow-buffer/src/buffer/run.rs
@@ -18,75 +18,108 @@
 use crate::ArrowNativeType;
 use crate::buffer::ScalarBuffer;
 
-/// A slice-able buffer of monotonically increasing, positive integers used to 
store run-ends
+/// A buffer of monotonically increasing, positive integers used to store 
run-ends.
 ///
-/// # Logical vs Physical
-///
-/// A [`RunEndBuffer`] is used to encode runs of the same value, the index of 
each run is
-/// called the physical index. The logical index is then the corresponding 
index in the logical
-/// run-encoded array, i.e. a single run of length `3`, would have the logical 
indices `0..3`.
+/// Used to compactly represent runs of the same value. Values being 
represented
+/// are stored in a separate buffer from this struct. See [`RunArray`] for an 
example
+/// of how this is used with a companion array to represent the values.
 ///
-/// Each value in [`RunEndBuffer::values`] is the cumulative length of all 
runs in the
-/// logical array, up to that physical index.
+/// # Logical vs Physical
 ///
-/// Consider a [`RunEndBuffer`] containing `[3, 4, 6]`. The maximum physical 
index is `2`,
-/// as there are `3` values, and the maximum logical index is `5`, as the 
maximum run end
-/// is `6`. The physical indices are therefore `[0, 0, 0, 1, 2, 2]`
+/// Physically, each value in the `run_ends` buffer is the cumulative length of
+/// all runs in the logical representation, up to that physical index. Consider
+/// the following example:
 ///
 /// ```text
-///     ┌─────────┐        ┌─────────┐           ┌─────────┐
-///     │    3    │        │    0    │ ─┬──────▶ │    0    │
-///     ├─────────┤        ├─────────┤  │        ├─────────┤
-///     │    4    │        │    1    │ ─┤ ┌────▶ │    1    │
-///     ├─────────┤        ├─────────┤  │ │      ├─────────┤
-///     │    6    │        │    2    │ ─┘ │ ┌──▶ │    2    │
-///     └─────────┘        ├─────────┤    │ │    └─────────┘
-///      run ends          │    3    │ ───┘ │  physical indices
-///                        ├─────────┤      │
-///                        │    4    │ ─────┤
-///                        ├─────────┤      │
-///                        │    5    │ ─────┘
-///                        └─────────┘
-///                      logical indices
+///           physical                        logical
+///     ┌─────────┬─────────┐           ┌─────────┬─────────┐
+///     │    3    │    0    │ ◄──────┬─ │    A    │    0    │
+///     ├─────────┼─────────┤        │  ├─────────┼─────────┤
+///     │    4    │    1    │ ◄────┐ ├─ │    A    │    1    │
+///     ├─────────┼─────────┤      │ │  ├─────────┼─────────┤
+///     │    6    │    2    │ ◄──┐ │ └─ │    A    │    2    │
+///     └─────────┴─────────┘    │ │    ├─────────┼─────────┤
+///      run-ends    index       │ └─── │    B    │    3    │
+///                              │      ├─────────┼─────────┤
+///      logical_offset = 0      ├───── │    C    │    4    │
+///      logical_length = 6      │      ├─────────┼─────────┤
+///                              └───── │    C    │    5    │
+///                                     └─────────┴─────────┘
+///                                       values     index
 /// ```
 ///
+/// A [`RunEndBuffer`] is physically the buffer and offset with length on the 
left.
+/// In this case, the offset and length represent the whole buffer, so it is 
essentially
+/// unsliced. See the section below on slicing for more details on how this 
buffer
+/// handles slicing.
+///
+/// This means that multiple logical values are represented in the same 
physical index,
+/// and multiple logical indices map to the same physical index. The 
[`RunEndBuffer`]
+/// containing `[3, 4, 6]` is essentially the physical indices `[0, 0, 0, 1, 
2, 2]`,
+/// and having a separately stored buffer of values such as `[A, B, C]` can 
turn
+/// this into a representation of `[A, A, A, B, C, C]`.
+///
 /// # Slicing
 ///
-/// In order to provide zero-copy slicing, this container stores a separate 
offset and length
+/// In order to provide zero-copy slicing, this struct stores a separate 
**logical**
+/// offset and length. Consider the following example:
+///
+/// ```text
+///           physical                        logical
+///     ┌─────────┬─────────┐           ┌ ─ ─ ─ ─ ┬ ─ ─ ─ ─ ┐
+///     │    3    │    0    │ ◄──────┐       A         0
+///     ├─────────┼─────────┤        │  ├── ─ ─ ─ ┼ ─ ─ ─ ─ ┤
+///     │    4    │    1    │ ◄────┐ │       A         1
+///     ├─────────┼─────────┤      │ │  ├─────────┼─────────┤
+///     │    6    │    2    │ ◄──┐ │ └─ │    A    │    2    │◄─── 
logical_offset
+///     └─────────┴─────────┘    │ │    ├─────────┼─────────┤
+///      run-ends    index       │ └─── │    B    │    3    │
+///                              │      ├─────────┼─────────┤
+///      logical_offset = 2      └───── │    C    │    4    │
+///      logical_length = 3             ├─────────┼─────────┤
+///                                          C         5     ◄─── 
logical_offset + logical_length
+///                                     └ ─ ─ ─ ─ ┴ ─ ─ ─ ─ ┘
+///                                       values     index
+/// ```
 ///
-/// For example, a [`RunEndBuffer`] containing values `[3, 6, 8]` with offset 
and length `4` would
-/// describe the physical indices `1, 1, 2, 2`
+/// The physical `run_ends` [`ScalarBuffer`] remains unchanged, in order to 
facilitate
+/// zero-copy. However, we now offset into the **logical** representation with 
an
+/// accompanying length. This allows us to represent values `[A, B, C]` using 
physical
+/// indices `0, 1, 2` with the same underlying physical buffer, at the cost of 
two
+/// extra `usize`s to represent the logical slice that was taken.
 ///
-/// For example, a [`RunEndBuffer`] containing values `[6, 8, 9]` with offset 
`2` and length `5`
-/// would describe the physical indices `0, 0, 0, 0, 1`
+/// (A [`RunEndBuffer`] is considered unsliced when `logical_offset` is `0` and
+/// `logical_length` is equal to the last value in `run_ends`)
 ///
+/// [`RunArray`]: https://docs.rs/arrow/latest/arrow/array/struct.RunArray.html
 /// [Run-End encoded layout]: 
https://arrow.apache.org/docs/format/Columnar.html#run-end-encoded-layout
 #[derive(Debug, Clone)]
 pub struct RunEndBuffer<E: ArrowNativeType> {
     run_ends: ScalarBuffer<E>,
-    len: usize,
-    offset: usize,
+    logical_length: usize,
+    logical_offset: usize,
 }
 
 impl<E> RunEndBuffer<E>
 where
     E: ArrowNativeType,
 {
-    /// Create a new [`RunEndBuffer`] from a [`ScalarBuffer`], an `offset` and 
`len`
+    /// Create a new [`RunEndBuffer`] from a [`ScalarBuffer`], `logical_offset`
+    /// and `logical_length`.
     ///
     /// # Panics
     ///
-    /// - `buffer` does not contain strictly increasing values greater than 
zero
-    /// - the last value of `buffer` is less than `offset + len`
-    pub fn new(run_ends: ScalarBuffer<E>, offset: usize, len: usize) -> Self {
+    /// - `run_ends` does not contain strictly increasing values greater than 
zero
+    /// - The last value of `run_ends` is less than `logical_offset + 
logical_length`
+    pub fn new(run_ends: ScalarBuffer<E>, logical_offset: usize, 
logical_length: usize) -> Self {
         assert!(
             run_ends.windows(2).all(|w| w[0] < w[1]),
             "run-ends not strictly increasing"
         );
 
-        if len != 0 {
+        if logical_length != 0 {
             assert!(!run_ends.is_empty(), "non-empty slice but empty 
run-ends");
-            let end = E::from_usize(offset.saturating_add(len)).unwrap();
+            let end = 
E::from_usize(logical_offset.saturating_add(logical_length)).unwrap();
             assert!(
                 *run_ends.first().unwrap() > E::usize_as(0),
                 "run-ends not greater than 0"
@@ -99,41 +132,46 @@ where
 
         Self {
             run_ends,
-            offset,
-            len,
+            logical_offset,
+            logical_length,
         }
     }
 
-    /// Create a new [`RunEndBuffer`] from an [`ScalarBuffer`], an `offset` 
and `len`
+    /// Create a new [`RunEndBuffer`] from a [`ScalarBuffer`], `logical_offset`
+    /// and `logical_length`.
     ///
     /// # Safety
     ///
-    /// - `buffer` must contain strictly increasing values greater than zero
-    /// - The last value of `buffer` must be greater than or equal to `offset 
+ len`
-    pub unsafe fn new_unchecked(run_ends: ScalarBuffer<E>, offset: usize, len: 
usize) -> Self {
+    /// - `run_ends` must contain strictly increasing values greater than zero
+    /// - The last value of `run_ends` must be greater than or equal to 
`logical_offset + logical_len`
+    pub unsafe fn new_unchecked(
+        run_ends: ScalarBuffer<E>,
+        logical_offset: usize,
+        logical_length: usize,
+    ) -> Self {
         Self {
             run_ends,
-            offset,
-            len,
+            logical_offset,
+            logical_length,
         }
     }
 
-    /// Returns the logical offset into the run-ends stored by this buffer
+    /// Returns the logical offset into the run-ends stored by this buffer.
     #[inline]
     pub fn offset(&self) -> usize {
-        self.offset
+        self.logical_offset
     }
 
-    /// Returns the logical length of the run-ends stored by this buffer
+    /// Returns the logical length of the run-ends stored by this buffer.
     #[inline]
     pub fn len(&self) -> usize {
-        self.len
+        self.logical_length
     }
 
-    /// Returns true if this buffer is empty
+    /// Returns true if this buffer is logically empty.
     #[inline]
     pub fn is_empty(&self) -> bool {
-        self.len == 0
+        self.logical_length == 0
     }
 
     /// Free up unused memory.
@@ -142,23 +180,32 @@ where
         self.run_ends.shrink_to_fit();
     }
 
-    /// Returns the values of this [`RunEndBuffer`] not including any offset
+    /// Returns the physical (**unsliced**) run ends of this buffer.
+    ///
+    /// Take care when operating on these values as it doesn't take into 
account
+    /// any logical slicing that may have occurred.
     #[inline]
     pub fn values(&self) -> &[E] {
         &self.run_ends
     }
 
-    /// Returns the maximum run-end encoded in the underlying buffer
+    /// Returns the maximum run-end encoded in the underlying buffer; that is, 
the
+    /// last physical run of the buffer. This does not take into account any 
logical
+    /// slicing that may have occurred.
     #[inline]
     pub fn max_value(&self) -> usize {
         self.values().last().copied().unwrap_or_default().as_usize()
     }
 
-    /// Performs a binary search to find the physical index for the given 
logical index
+    /// Performs a binary search to find the physical index for the given 
logical
+    /// index.
+    ///
+    /// Useful for extracting the corresponding physical `run_ends` when this 
buffer
+    /// is logically sliced.
     ///
-    /// The result is arbitrary if `logical_index >= self.len()`
+    /// The result is arbitrary if `logical_index >= self.len()`.
     pub fn get_physical_index(&self, logical_index: usize) -> usize {
-        let logical_index = E::usize_as(self.offset + logical_index);
+        let logical_index = E::usize_as(self.logical_offset + logical_index);
         let cmp = |p: &E| p.partial_cmp(&logical_index).unwrap();
 
         match self.run_ends.binary_search_by(cmp) {
@@ -167,46 +214,57 @@ where
         }
     }
 
-    /// Returns the physical index at which the logical array starts
+    /// Returns the physical index at which the logical array starts.
+    ///
+    /// The same as calling `get_physical_index(0)` but with a fast path if the
+    /// buffer is not logically sliced, in which case it always returns `0`.
     pub fn get_start_physical_index(&self) -> usize {
-        if self.offset == 0 || self.len == 0 {
+        if self.logical_offset == 0 || self.logical_length == 0 {
             return 0;
         }
         // Fallback to binary search
         self.get_physical_index(0)
     }
 
-    /// Returns the physical index at which the logical array ends
+    /// Returns the physical index at which the logical array ends.
+    ///
+    /// The same as calling `get_physical_index(length - 1)` but with a fast 
path
+    /// if the buffer is not logically sliced, in which case it returns 
`length - 1`.
     pub fn get_end_physical_index(&self) -> usize {
-        if self.len == 0 {
+        if self.logical_length == 0 {
             return 0;
         }
-        if self.max_value() == self.offset + self.len {
+        if self.max_value() == self.logical_offset + self.logical_length {
             return self.values().len() - 1;
         }
         // Fallback to binary search
-        self.get_physical_index(self.len - 1)
+        self.get_physical_index(self.logical_length - 1)
     }
 
-    /// Slices this [`RunEndBuffer`] by the provided `offset` and `length`
-    pub fn slice(&self, offset: usize, len: usize) -> Self {
+    /// Slices this [`RunEndBuffer`] by the provided `logical_offset` and 
`logical_length`.
+    ///
+    /// # Panics
+    ///
+    /// - Specified slice (`logical_offset` + `logical_length`) exceeds 
existing
+    ///   logical length
+    pub fn slice(&self, logical_offset: usize, logical_length: usize) -> Self {
         assert!(
-            offset.saturating_add(len) <= self.len,
+            logical_offset.saturating_add(logical_length) <= 
self.logical_length,
             "the length + offset of the sliced RunEndBuffer cannot exceed the 
existing length"
         );
         Self {
             run_ends: self.run_ends.clone(),
-            offset: self.offset + offset,
-            len,
+            logical_offset: self.logical_offset + logical_offset,
+            logical_length,
         }
     }
 
-    /// Returns the inner [`ScalarBuffer`]
+    /// Returns the inner [`ScalarBuffer`].
     pub fn inner(&self) -> &ScalarBuffer<E> {
         &self.run_ends
     }
 
-    /// Returns the inner [`ScalarBuffer`], consuming self
+    /// Returns the inner [`ScalarBuffer`], consuming self.
     pub fn into_inner(self) -> ScalarBuffer<E> {
         self.run_ends
     }
diff --git a/arrow-cast/src/cast/mod.rs b/arrow-cast/src/cast/mod.rs
index ac6795a3c6..67fd02837d 100644
--- a/arrow-cast/src/cast/mod.rs
+++ b/arrow-cast/src/cast/mod.rs
@@ -11745,6 +11745,23 @@ mod tests {
         );
     }
 
+    #[test]
+    #[should_panic = "assertion `left == right` failed\n  left: 
ScalarBuffer([1, 1, 2])\n right: [2, 2, 3]"]
+    // TODO: fix cast of RunArrays to account for sliced RunArray's
+    // https://github.com/apache/arrow-rs/issues/9018
+    fn test_sliced_run_end_encoded_to_primitive() {
+        let run_ends = Int32Array::from(vec![2, 5, 6]);
+        let values = Int32Array::from(vec![1, 2, 3]);
+        // [1, 1, 2, 2, 2, 3]
+        let run_array = RunArray::<Int32Type>::try_new(&run_ends, 
&values).unwrap();
+        let run_array = run_array.slice(3, 3); // [2, 2, 3]
+        let array_ref = Arc::new(run_array) as ArrayRef;
+
+        let cast_result = cast(&array_ref, &DataType::Int64).unwrap();
+        let result_run_array = cast_result.as_primitive::<Int64Type>();
+        assert_eq!(result_run_array.values(), &[2, 2, 3]);
+    }
+
     #[test]
     fn test_run_end_encoded_to_string() {
         let run_ends = Int32Array::from(vec![2, 3, 5]);
diff --git a/arrow-select/src/concat.rs b/arrow-select/src/concat.rs
index 81b24827e3..84c41b6e16 100644
--- a/arrow-select/src/concat.rs
+++ b/arrow-select/src/concat.rs
@@ -1715,6 +1715,31 @@ mod tests {
         assert_eq!(&[10, 20, 30, 40], values.values());
     }
 
+    #[test]
+    #[should_panic = "assertion `left == right` failed\n  left: [20, 20, 40, 
40, 40]\n right: [10, 10, 20, 20, 30, 40, 40, 40]"]
+    // TODO: fix concat of RunArrays to account for sliced RunArray's
+    // https://github.com/apache/arrow-rs/issues/9018
+    fn test_concat_sliced_run_array() {
+        // Slicing away first run in both arrays
+        let run_ends1 = Int32Array::from(vec![2, 4]);
+        let values1 = Int32Array::from(vec![10, 20]);
+        let array1 = RunArray::try_new(&run_ends1, &values1).unwrap(); // [10, 
10, 20, 20]
+        let array1 = array1.slice(2, 2); // [20, 20]
+
+        let run_ends2 = Int32Array::from(vec![1, 4]);
+        let values2 = Int32Array::from(vec![30, 40]);
+        let array2 = RunArray::try_new(&run_ends2, &values2).unwrap(); // [30, 
40, 40, 40]
+        let array2 = array2.slice(1, 3); // [40, 40, 40]
+
+        let result = concat(&[&array1, &array2]).unwrap();
+        let result = result.as_run::<Int32Type>();
+        let result = result.downcast::<Int32Array>().unwrap();
+
+        let expected = vec![20, 20, 40, 40, 40];
+        let actual = result.into_iter().flatten().collect::<Vec<_>>();
+        assert_eq!(expected, actual);
+    }
+
     #[test]
     fn test_concat_run_array_matching_first_last_value() {
         // Create a run array with run ends [2, 4, 7] and values [10, 20, 30]
diff --git a/arrow-select/src/filter.rs b/arrow-select/src/filter.rs
index 6153845e8c..1aa933ae18 100644
--- a/arrow-select/src/filter.rs
+++ b/arrow-select/src/filter.rs
@@ -1354,6 +1354,26 @@ mod tests {
         assert_eq!(actual.values(), expected.values())
     }
 
+    #[test]
+    #[should_panic = "assertion `left == right` failed\n  left: [-2, 9]\n 
right: [7, -2]"]
+    // TODO: fix filter of RunArrays to account for sliced RunArray's
+    // https://github.com/apache/arrow-rs/issues/9018
+    fn test_filter_run_end_encoding_array_sliced() {
+        let run_ends = Int64Array::from(vec![2, 3, 8]);
+        let values = Int64Array::from(vec![7, -2, 9]);
+        let a = RunArray::try_new(&run_ends, &values).unwrap(); // [7, 7, -2, 
9, 9, 9, 9, 9]
+        let a = a.slice(2, 3); // [-2, 9, 9]
+        let b = BooleanArray::from(vec![true, false, true]);
+        let result = filter(&a, &b).unwrap();
+
+        let result = result.as_run::<Int64Type>();
+        let result = result.downcast::<Int64Array>().unwrap();
+
+        let expected = vec![-2, 9];
+        let actual = result.into_iter().flatten().collect::<Vec<_>>();
+        assert_eq!(expected, actual);
+    }
+
     #[test]
     fn test_filter_run_end_encoding_array_remove_value() {
         let run_ends = Int32Array::from(vec![2, 3, 8, 10]);
diff --git a/arrow-select/src/interleave.rs b/arrow-select/src/interleave.rs
index f870bc8c2f..7a04c83f79 100644
--- a/arrow-select/src/interleave.rs
+++ b/arrow-select/src/interleave.rs
@@ -1171,6 +1171,32 @@ mod tests {
         assert_eq!(actual, expected);
     }
 
+    #[test]
+    #[should_panic = "assertion `left == right` failed\n  left: [1, 4, 2, 5, 
6]\n right: [2, 5, 2, 5, 6]"]
+    // TODO: fix interleave of RunArrays to account for sliced RunArray's
+    // https://github.com/apache/arrow-rs/issues/9018
+    fn test_interleave_run_end_encoded_sliced() {
+        let mut builder = PrimitiveRunBuilder::<Int32Type, Int32Type>::new();
+        builder.extend([1, 1, 2, 2, 2, 3].into_iter().map(Some));
+        let a = builder.finish();
+        let a = a.slice(2, 3); // [2, 2, 2]
+
+        let mut builder = PrimitiveRunBuilder::<Int32Type, Int32Type>::new();
+        builder.extend([4, 5, 5, 6, 6, 6].into_iter().map(Some));
+        let b = builder.finish();
+        let b = b.slice(1, 3); // [5, 5, 6]
+
+        let indices = &[(0, 1), (1, 0), (0, 3), (1, 2), (1, 3)];
+        let result = interleave(&[&a, &b], indices).unwrap();
+
+        let result = result.as_run::<Int32Type>();
+        let result = result.downcast::<Int32Array>().unwrap();
+
+        let expected = vec![2, 5, 2, 5, 6];
+        let actual = result.into_iter().flatten().collect::<Vec<_>>();
+        assert_eq!(actual, expected);
+    }
+
     #[test]
     fn test_interleave_run_end_encoded_string() {
         let a: Int32RunArray = vec!["hello", "hello", "world", "world", "foo"]
diff --git a/arrow-select/src/take.rs b/arrow-select/src/take.rs
index 7459018ea4..7f7791a07a 100644
--- a/arrow-select/src/take.rs
+++ b/arrow-select/src/take.rs
@@ -2438,6 +2438,27 @@ mod tests {
         assert_eq!(take_out_values.values(), &[2, 2, 2, 2, 1]);
     }
 
+    #[test]
+    fn test_take_runs_sliced() {
+        let logical_array: Vec<i32> = vec![1, 1, 2, 2, 3, 3, 3, 4, 4, 5, 5, 6, 
6];
+
+        let mut builder = PrimitiveRunBuilder::<Int32Type, Int32Type>::new();
+        builder.extend(logical_array.into_iter().map(Some));
+        let run_array = builder.finish();
+
+        let run_array = run_array.slice(4, 6); // [3, 3, 3, 4, 4, 5]
+
+        let take_indices: PrimitiveArray<Int32Type> = vec![0, 5, 5, 1, 
4].into_iter().collect();
+
+        let result = take_run(&run_array, &take_indices).unwrap();
+        let result = result.downcast::<Int32Array>().unwrap();
+
+        let expected = vec![3, 5, 5, 3, 4];
+        let actual = result.into_iter().flatten().collect::<Vec<_>>();
+
+        assert_eq!(expected, actual);
+    }
+
     #[test]
     fn test_take_value_index_from_fixed_list() {
         let list = FixedSizeListArray::from_iter_primitive::<Int32Type, _, _>(

(arrow-rs) branch main updated: Improve `RunArray` documentation (#9019)

Reply via email to