This is an automated email from the ASF dual-hosted git repository.

github-bot pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/datafusion.git


The following commit(s) were added to refs/heads/main by this push:
     new b6b542e87b perf: Optimize `array_positions()` for scalar needle 
(#20770)
b6b542e87b is described below

commit b6b542e87b84f4744096106bea0de755b2e70cc5
Author: Neil Conway <[email protected]>
AuthorDate: Wed Mar 18 07:35:32 2026 -0400

    perf: Optimize `array_positions()` for scalar needle (#20770)
    
    ## Which issue does this PR close?
    
    - Closes #20769.
    
    ## Rationale for this change
    
    `array_positions` previously compared the needle against each row's
    sub-array individually. When the needle is a scalar (the common case),
    we can do a single bulk `arrow_ord::cmp::not_distinct` comparison
    against the entire flat values buffer and then walk the result bitmap,
    which is significantly faster: the speedup on the `array_positions()`
    microbenchmarks ranges from 5x to 40x, depending on the size of the
    array.
    
    The same pattern has already been applied to `array_position` (#20532),
    and previously to other array UDFs.
    
    ## What changes are included in this PR?
    
    - Add benchmarks for `array_positions`.
    - Implement bulk-comparison optimization
    - Refactor `array_position`'s existing fast path slightly for
    consistency
    - Code cleanup to use "haystack" and "needle" consistently, not vague
    terms like "list_array" and "element"
    - Add unit tests for `array_positions` with sliced ListArrays, for peace
    of mind
    - Add unit tests for sliced lists and sliced lists with nulls for the
    new `array_positions` fast path.
    
    ## Are these changes tested?
    
    Yes.
    
    ## Are there any user-facing changes?
    
    No.
    
    ## AI usage
    
    Multiple AI tools were used to iterate on this PR. I have reviewed and
    understand the resulting code.
    
    ---------
    
    Co-authored-by: Oleks V <[email protected]>
---
 .../functions-nested/benches/array_position.rs     | 109 ++++-
 datafusion/functions-nested/src/position.rs        | 473 +++++++++++++++------
 datafusion/sqllogictest/test_files/array.slt       |   6 +
 docs/source/user-guide/sql/scalar_functions.md     |   2 +-
 4 files changed, 460 insertions(+), 130 deletions(-)

diff --git a/datafusion/functions-nested/benches/array_position.rs 
b/datafusion/functions-nested/benches/array_position.rs
index 0836764844..c718b2b725 100644
--- a/datafusion/functions-nested/benches/array_position.rs
+++ b/datafusion/functions-nested/benches/array_position.rs
@@ -24,7 +24,7 @@ use criterion::{
 use datafusion_common::ScalarValue;
 use datafusion_common::config::ConfigOptions;
 use datafusion_expr::{ColumnarValue, ScalarFunctionArgs, ScalarUDFImpl};
-use datafusion_functions_nested::position::ArrayPosition;
+use datafusion_functions_nested::position::{ArrayPosition, ArrayPositions};
 use rand::Rng;
 use rand::SeedableRng;
 use rand::rngs::StdRng;
@@ -39,6 +39,7 @@ const SENTINEL_NEEDLE: i64 = -1;
 fn criterion_benchmark(c: &mut Criterion) {
     for size in [10, 100, 500] {
         bench_array_position(c, size);
+        bench_array_positions(c, size);
     }
 }
 
@@ -146,6 +147,112 @@ fn bench_array_position(c: &mut Criterion, array_size: 
usize) {
     group.finish();
 }
 
+fn bench_array_positions(c: &mut Criterion, array_size: usize) {
+    let mut group = c.benchmark_group("array_positions_i64");
+    let haystack_found_once = create_haystack_with_sentinel(
+        NUM_ROWS,
+        array_size,
+        NULL_DENSITY,
+        SENTINEL_NEEDLE,
+        0,
+    );
+    let haystack_found_many = create_haystack_with_sentinels(
+        NUM_ROWS,
+        array_size,
+        NULL_DENSITY,
+        SENTINEL_NEEDLE,
+    );
+    let haystack_not_found =
+        create_haystack_without_sentinel(NUM_ROWS, array_size, NULL_DENSITY);
+    let num_rows = haystack_not_found.len();
+    let arg_fields: Vec<Arc<Field>> = vec![
+        Field::new("haystack", haystack_not_found.data_type().clone(), 
false).into(),
+        Field::new("needle", DataType::Int64, false).into(),
+    ];
+    let return_field: Arc<Field> = Field::new(
+        "result",
+        DataType::List(Arc::new(Field::new_list_field(DataType::UInt64, 
true))),
+        true,
+    )
+    .into();
+    let config_options = Arc::new(ConfigOptions::default());
+    let needle = ScalarValue::Int64(Some(SENTINEL_NEEDLE));
+
+    let args_found_once = vec![
+        ColumnarValue::Array(haystack_found_once.clone()),
+        ColumnarValue::Scalar(needle.clone()),
+    ];
+    group.bench_with_input(
+        BenchmarkId::new("found_once", array_size),
+        &array_size,
+        |b, _| {
+            let udf = ArrayPositions::new();
+            b.iter(|| {
+                black_box(
+                    udf.invoke_with_args(ScalarFunctionArgs {
+                        args: args_found_once.clone(),
+                        arg_fields: arg_fields.clone(),
+                        number_rows: num_rows,
+                        return_field: return_field.clone(),
+                        config_options: config_options.clone(),
+                    })
+                    .unwrap(),
+                )
+            })
+        },
+    );
+
+    let args_found_many = vec![
+        ColumnarValue::Array(haystack_found_many.clone()),
+        ColumnarValue::Scalar(needle.clone()),
+    ];
+    group.bench_with_input(
+        BenchmarkId::new("found_many", array_size),
+        &array_size,
+        |b, _| {
+            let udf = ArrayPositions::new();
+            b.iter(|| {
+                black_box(
+                    udf.invoke_with_args(ScalarFunctionArgs {
+                        args: args_found_many.clone(),
+                        arg_fields: arg_fields.clone(),
+                        number_rows: num_rows,
+                        return_field: return_field.clone(),
+                        config_options: config_options.clone(),
+                    })
+                    .unwrap(),
+                )
+            })
+        },
+    );
+
+    let args_not_found = vec![
+        ColumnarValue::Array(haystack_not_found.clone()),
+        ColumnarValue::Scalar(needle.clone()),
+    ];
+    group.bench_with_input(
+        BenchmarkId::new("not_found", array_size),
+        &array_size,
+        |b, _| {
+            let udf = ArrayPositions::new();
+            b.iter(|| {
+                black_box(
+                    udf.invoke_with_args(ScalarFunctionArgs {
+                        args: args_not_found.clone(),
+                        arg_fields: arg_fields.clone(),
+                        number_rows: num_rows,
+                        return_field: return_field.clone(),
+                        config_options: config_options.clone(),
+                    })
+                    .unwrap(),
+                )
+            })
+        },
+    );
+
+    group.finish();
+}
+
 fn create_haystack_without_sentinel(
     num_rows: usize,
     array_size: usize,
diff --git a/datafusion/functions-nested/src/position.rs 
b/datafusion/functions-nested/src/position.rs
index 0214b1552b..acdeb202f9 100644
--- a/datafusion/functions-nested/src/position.rs
+++ b/datafusion/functions-nested/src/position.rs
@@ -18,6 +18,7 @@
 //! [`ScalarUDFImpl`] definitions for array_position and array_positions 
functions.
 
 use arrow::array::Scalar;
+use arrow::buffer::OffsetBuffer;
 use arrow::datatypes::DataType;
 use arrow::datatypes::{
     DataType::{LargeList, List, UInt64},
@@ -25,7 +26,8 @@ use arrow::datatypes::{
 };
 use datafusion_common::ScalarValue;
 use datafusion_expr::{
-    ColumnarValue, Documentation, ScalarUDFImpl, Signature, Volatility,
+    ColumnarValue, Documentation, ScalarFunctionArgs, ScalarUDFImpl, Signature,
+    Volatility,
 };
 use datafusion_macros::user_doc;
 
@@ -122,57 +124,10 @@ impl ScalarUDFImpl for ArrayPosition {
         Ok(UInt64)
     }
 
-    fn invoke_with_args(
-        &self,
-        args: datafusion_expr::ScalarFunctionArgs,
-    ) -> Result<ColumnarValue> {
-        let [first_arg, second_arg, third_arg @ ..] = args.args.as_slice() 
else {
-            return exec_err!("array_position expects two or three arguments");
-        };
-
-        match second_arg {
-            ColumnarValue::Scalar(scalar_element) => {
-                // Nested element types (List, Struct) can't use the fast path
-                // (because Arrow's `non_distinct` does not support them).
-                if scalar_element.data_type().is_nested() {
-                    return 
make_scalar_function(array_position_inner)(&args.args);
-                }
-
-                // Determine batch length from whichever argument is columnar;
-                // if all inputs are scalar, batch length is 1.
-                let (num_rows, all_inputs_scalar) = match (first_arg, 
third_arg.first()) {
-                    (ColumnarValue::Array(a), _) => (a.len(), false),
-                    (_, Some(ColumnarValue::Array(a))) => (a.len(), false),
-                    _ => (1, true),
-                };
-
-                let element_arr = scalar_element.to_array_of_size(1)?;
-                let haystack = first_arg.to_array(num_rows)?;
-                let arr_from = resolve_start_from(third_arg.first(), 
num_rows)?;
-
-                let result = match haystack.data_type() {
-                    List(_) => {
-                        let list = as_generic_list_array::<i32>(&haystack)?;
-                        array_position_scalar::<i32>(list, &element_arr, 
&arr_from)
-                    }
-                    LargeList(_) => {
-                        let list = as_generic_list_array::<i64>(&haystack)?;
-                        array_position_scalar::<i64>(list, &element_arr, 
&arr_from)
-                    }
-                    t => exec_err!("array_position does not support type 
'{t}'."),
-                }?;
-
-                if all_inputs_scalar {
-                    Ok(ColumnarValue::Scalar(ScalarValue::try_from_array(
-                        &result, 0,
-                    )?))
-                } else {
-                    Ok(ColumnarValue::Array(result))
-                }
-            }
-            ColumnarValue::Array(_) => {
-                make_scalar_function(array_position_inner)(&args.args)
-            }
+    fn invoke_with_args(&self, args: ScalarFunctionArgs) -> 
Result<ColumnarValue> {
+        match try_array_position_scalar(&args.args)? {
+            Some(result) => Ok(result),
+            None => make_scalar_function(array_position_inner)(&args.args),
         }
     }
 
@@ -185,6 +140,57 @@ impl ScalarUDFImpl for ArrayPosition {
     }
 }
 
+/// Attempts the scalar-needle fast path for `array_position`.
+fn try_array_position_scalar(args: &[ColumnarValue]) -> 
Result<Option<ColumnarValue>> {
+    if args.len() < 2 || args.len() > 3 {
+        return exec_err!("array_position expects two or three arguments");
+    }
+
+    // Fallback to the generic code path if the needle is an array
+    let scalar_needle = match &args[1] {
+        ColumnarValue::Scalar(s) => s,
+        ColumnarValue::Array(_) => return Ok(None),
+    };
+
+    // `not_distinct` doesn't support nested types (List, Struct, etc.),
+    // so fall back to the generic code path for those.
+    if scalar_needle.data_type().is_nested() {
+        return Ok(None);
+    }
+
+    // Determine batch length from whichever argument is columnar;
+    // if all inputs are scalar, batch length is 1.
+    let (num_rows, all_inputs_scalar) = match (&args[0], args.get(2)) {
+        (ColumnarValue::Array(a), _) => (a.len(), false),
+        (_, Some(ColumnarValue::Array(a))) => (a.len(), false),
+        _ => (1, true),
+    };
+
+    let needle = scalar_needle.to_array_of_size(1)?;
+    let haystack = args[0].to_array(num_rows)?;
+    let arr_from = resolve_start_from(args.get(2), num_rows)?;
+
+    let result = match haystack.data_type() {
+        List(_) => {
+            let list = as_list_array(&haystack)?;
+            array_position_scalar::<i32>(list, &needle, &arr_from)
+        }
+        LargeList(_) => {
+            let list = as_large_list_array(&haystack)?;
+            array_position_scalar::<i64>(list, &needle, &arr_from)
+        }
+        t => exec_err!("array_position does not support type '{t}'"),
+    }?;
+
+    if all_inputs_scalar {
+        Ok(Some(ColumnarValue::Scalar(ScalarValue::try_from_array(
+            &result, 0,
+        )?)))
+    } else {
+        Ok(Some(ColumnarValue::Array(result)))
+    }
+}
+
 fn array_position_inner(args: &[ArrayRef]) -> Result<ArrayRef> {
     if args.len() < 2 || args.len() > 3 {
         return exec_err!("array_position expects two or three arguments");
@@ -192,7 +198,7 @@ fn array_position_inner(args: &[ArrayRef]) -> 
Result<ArrayRef> {
     match &args[0].data_type() {
         List(_) => general_position_dispatch::<i32>(args),
         LargeList(_) => general_position_dispatch::<i64>(args),
-        array_type => exec_err!("array_position does not support type 
'{array_type}'."),
+        dt => exec_err!("array_position does not support type '{dt}'"),
     }
 }
 
@@ -216,48 +222,46 @@ fn resolve_start_from(
     }
 }
 
-/// Fast path for `array_position` when the element is a scalar.
+/// Fast path for `array_position` when the needle is scalar.
 ///
-/// Performs a single bulk `not_distinct` comparison of the scalar element
-/// against the entire flattened values buffer, then walks the result bitmap
-/// using offsets to find per-row first-match positions.
+/// Performs a single bulk `not_distinct` comparison of the needle against the
+/// entire flat values buffer, then walks the result bitmap using offsets to
+/// find per-row first-match positions.
 fn array_position_scalar<O: OffsetSizeTrait>(
-    list_array: &GenericListArray<O>,
-    element_array: &ArrayRef,
+    haystack: &GenericListArray<O>,
+    needle: &ArrayRef,
     arr_from: &[i64], // 0-indexed
 ) -> Result<ArrayRef> {
-    crate::utils::check_datatypes(
-        "array_position",
-        &[list_array.values(), element_array],
-    )?;
+    crate::utils::check_datatypes("array_position", &[haystack.values(), 
needle])?;
 
-    if list_array.len() == 0 {
+    if haystack.len() == 0 {
         return Ok(Arc::new(UInt64Array::new_null(0)));
     }
 
-    let element_datum = Scalar::new(Arc::clone(element_array));
-    let validity = list_array.nulls();
+    let needle_datum = Scalar::new(Arc::clone(needle));
+    let validity = haystack.nulls();
 
-    // Only compare the visible portion of the values buffer, which avoids
-    // wasted work for sliced ListArrays.
-    let offsets = list_array.offsets();
+    // Only convert the visible portion of the values array. For sliced
+    // ListArrays, values() returns the full underlying array but only
+    // elements between the first and last offset are referenced.
+    let offsets = haystack.offsets();
     let first_offset = offsets[0].as_usize();
-    let last_offset = offsets[list_array.len()].as_usize();
-    let visible_values = list_array
+    let last_offset = offsets[haystack.len()].as_usize();
+    let visible_values = haystack
         .values()
         .slice(first_offset, last_offset - first_offset);
 
     // `not_distinct` treats NULL=NULL as true, matching the semantics of
-    // `array_position`
-    let eq_array = arrow_ord::cmp::not_distinct(&visible_values, 
&element_datum)?;
+    // `array_position`.
+    let eq_array = arrow_ord::cmp::not_distinct(&visible_values, 
&needle_datum)?;
     let eq_bits = eq_array.values();
 
-    let mut result: Vec<Option<u64>> = Vec::with_capacity(list_array.len());
+    let mut result: Vec<Option<u64>> = Vec::with_capacity(haystack.len());
     let mut matches = eq_bits.set_indices().peekable();
 
     // Match positions are relative to visible_values (0-based), so
     // subtract first_offset from each offset when comparing.
-    for i in 0..list_array.len() {
+    for i in 0..haystack.len() {
         let start = offsets[i].as_usize() - first_offset;
         let end = offsets[i + 1].as_usize() - first_offset;
 
@@ -295,18 +299,15 @@ fn array_position_scalar<O: OffsetSizeTrait>(
         }
     }
 
-    debug_assert_eq!(result.len(), list_array.len());
+    debug_assert_eq!(result.len(), haystack.len());
     Ok(Arc::new(UInt64Array::from(result)))
 }
 
 fn general_position_dispatch<O: OffsetSizeTrait>(args: &[ArrayRef]) -> 
Result<ArrayRef> {
-    let list_array = as_generic_list_array::<O>(&args[0])?;
-    let element_array = &args[1];
+    let haystack = as_generic_list_array::<O>(&args[0])?;
+    let needle = &args[1];
 
-    crate::utils::check_datatypes(
-        "array_position",
-        &[list_array.values(), element_array],
-    )?;
+    crate::utils::check_datatypes("array_position", &[haystack.values(), 
needle])?;
 
     let arr_from = if args.len() == 3 {
         as_int64_array(&args[2])?
@@ -315,34 +316,30 @@ fn general_position_dispatch<O: OffsetSizeTrait>(args: 
&[ArrayRef]) -> Result<Ar
             .map(|&x| x - 1)
             .collect::<Vec<_>>()
     } else {
-        vec![0; list_array.len()]
+        vec![0; haystack.len()]
     };
 
-    for (arr, &from) in list_array.iter().zip(arr_from.iter()) {
-        // If `arr` is `None`: we will get null if we got null in the array, 
so we don't need to check
-        if !arr.is_none_or(|arr| from >= 0 && (from as usize) <= arr.len()) {
+    for (row, &from) in haystack.iter().zip(arr_from.iter()) {
+        if !row.is_none_or(|row| from >= 0 && (from as usize) <= row.len()) {
             return exec_err!("start_from out of bounds: {}", from + 1);
         }
     }
 
-    generic_position::<O>(list_array, element_array, &arr_from)
+    generic_position::<O>(haystack, needle, &arr_from)
 }
 
-fn generic_position<OffsetSize: OffsetSizeTrait>(
-    list_array: &GenericListArray<OffsetSize>,
-    element_array: &ArrayRef,
+fn generic_position<O: OffsetSizeTrait>(
+    haystack: &GenericListArray<O>,
+    needle: &ArrayRef,
     arr_from: &[i64], // 0-indexed
 ) -> Result<ArrayRef> {
-    let mut data = Vec::with_capacity(list_array.len());
+    let mut data = Vec::with_capacity(haystack.len());
 
-    for (row_index, (list_array_row, &from)) in
-        list_array.iter().zip(arr_from.iter()).enumerate()
-    {
+    for (row_index, (row, &from)) in 
haystack.iter().zip(arr_from.iter()).enumerate() {
         let from = from as usize;
 
-        if let Some(list_array_row) = list_array_row {
-            let eq_array =
-                compare_element_to_list(&list_array_row, element_array, 
row_index, true)?;
+        if let Some(row) = row {
+            let eq_array = compare_element_to_list(&row, needle, row_index, 
true)?;
 
             // Collect `true`s in 1-indexed positions
             let index = eq_array
@@ -384,17 +381,20 @@ make_udf_expr_and_func!(
         name = "array",
         description = "Array expression. Can be a constant, column, or 
function, and any combination of array operators."
     ),
-    argument(
-        name = "element",
-        description = "Element to search for position in the array."
-    )
+    argument(name = "element", description = "Element to search for in the 
array.")
 )]
 #[derive(Debug, PartialEq, Eq, Hash)]
-pub(super) struct ArrayPositions {
+pub struct ArrayPositions {
     signature: Signature,
     aliases: Vec<String>,
 }
 
+impl Default for ArrayPositions {
+    fn default() -> Self {
+        Self::new()
+    }
+}
+
 impl ArrayPositions {
     pub fn new() -> Self {
         Self {
@@ -420,11 +420,11 @@ impl ScalarUDFImpl for ArrayPositions {
         Ok(List(Arc::new(Field::new_list_field(UInt64, true))))
     }
 
-    fn invoke_with_args(
-        &self,
-        args: datafusion_expr::ScalarFunctionArgs,
-    ) -> Result<ColumnarValue> {
-        make_scalar_function(array_positions_inner)(&args.args)
+    fn invoke_with_args(&self, args: ScalarFunctionArgs) -> 
Result<ColumnarValue> {
+        match try_array_positions_scalar(&args.args)? {
+            Some(result) => Ok(result),
+            None => make_scalar_function(array_positions_inner)(&args.args),
+        }
     }
 
     fn aliases(&self) -> &[String] {
@@ -436,36 +436,70 @@ impl ScalarUDFImpl for ArrayPositions {
     }
 }
 
-fn array_positions_inner(args: &[ArrayRef]) -> Result<ArrayRef> {
-    let [array, element] = take_function_args("array_positions", args)?;
+/// Attempts the scalar-needle fast path for `array_positions`.
+fn try_array_positions_scalar(args: &[ColumnarValue]) -> 
Result<Option<ColumnarValue>> {
+    let [haystack_arg, needle_arg] = take_function_args("array_positions", 
args)?;
+
+    let scalar_needle = match needle_arg {
+        ColumnarValue::Scalar(s) => s,
+        ColumnarValue::Array(_) => return Ok(None),
+    };
+
+    // `not_distinct` doesn't support nested types (List, Struct, etc.),
+    // so fall back to the per-row path for those.
+    if scalar_needle.data_type().is_nested() {
+        return Ok(None);
+    }
+
+    let (num_rows, all_inputs_scalar) = match haystack_arg {
+        ColumnarValue::Array(a) => (a.len(), false),
+        ColumnarValue::Scalar(_) => (1, true),
+    };
 
-    match &array.data_type() {
+    let needle = scalar_needle.to_array_of_size(1)?;
+    let haystack = haystack_arg.to_array(num_rows)?;
+
+    let result = match haystack.data_type() {
         List(_) => {
-            let arr = as_list_array(&array)?;
-            crate::utils::check_datatypes("array_positions", &[arr.values(), 
element])?;
-            general_positions::<i32>(arr, element)
+            let list = as_list_array(&haystack)?;
+            array_positions_scalar::<i32>(list, &needle)
         }
         LargeList(_) => {
-            let arr = as_large_list_array(&array)?;
-            crate::utils::check_datatypes("array_positions", &[arr.values(), 
element])?;
-            general_positions::<i64>(arr, element)
-        }
-        array_type => {
-            exec_err!("array_positions does not support type '{array_type}'.")
+            let list = as_large_list_array(&haystack)?;
+            array_positions_scalar::<i64>(list, &needle)
         }
+        t => exec_err!("array_positions does not support type '{t}'"),
+    }?;
+
+    if all_inputs_scalar {
+        Ok(Some(ColumnarValue::Scalar(ScalarValue::try_from_array(
+            &result, 0,
+        )?)))
+    } else {
+        Ok(Some(ColumnarValue::Array(result)))
     }
 }
 
-fn general_positions<OffsetSize: OffsetSizeTrait>(
-    list_array: &GenericListArray<OffsetSize>,
-    element_array: &ArrayRef,
+fn array_positions_inner(args: &[ArrayRef]) -> Result<ArrayRef> {
+    let [haystack, needle] = take_function_args("array_positions", args)?;
+
+    match &haystack.data_type() {
+        List(_) => general_positions::<i32>(as_list_array(&haystack)?, needle),
+        LargeList(_) => 
general_positions::<i64>(as_large_list_array(&haystack)?, needle),
+        dt => exec_err!("array_positions does not support type '{dt}'"),
+    }
+}
+
+fn general_positions<O: OffsetSizeTrait>(
+    haystack: &GenericListArray<O>,
+    needle: &ArrayRef,
 ) -> Result<ArrayRef> {
-    let mut data = Vec::with_capacity(list_array.len());
+    crate::utils::check_datatypes("array_positions", &[haystack.values(), 
needle])?;
+    let mut data = Vec::with_capacity(haystack.len());
 
-    for (row_index, list_array_row) in list_array.iter().enumerate() {
-        if let Some(list_array_row) = list_array_row {
-            let eq_array =
-                compare_element_to_list(&list_array_row, element_array, 
row_index, true)?;
+    for (row_index, row) in haystack.iter().enumerate() {
+        if let Some(row) = row {
+            let eq_array = compare_element_to_list(&row, needle, row_index, 
true)?;
 
             // Collect `true`s in 1-indexed positions
             let indexes = eq_array
@@ -485,13 +519,88 @@ fn general_positions<OffsetSize: OffsetSizeTrait>(
     ))
 }
 
+/// Fast path for `array_positions` when the needle is scalar.
+///
+/// Performs a single bulk `not_distinct` comparison of the needle against the
+/// entire flat values buffer, then walks the result bitmap using offsets to
+/// collect all per-row match positions.
+fn array_positions_scalar<O: OffsetSizeTrait>(
+    haystack: &GenericListArray<O>,
+    needle: &ArrayRef,
+) -> Result<ArrayRef> {
+    crate::utils::check_datatypes("array_positions", &[haystack.values(), 
needle])?;
+
+    let num_rows = haystack.len();
+    if num_rows == 0 {
+        return Ok(Arc::new(ListArray::try_new(
+            Arc::new(Field::new_list_field(UInt64, true)),
+            OffsetBuffer::new_zeroed(1),
+            Arc::new(UInt64Array::from(Vec::<u64>::new())),
+            None,
+        )?));
+    }
+
+    let needle_datum = Scalar::new(Arc::clone(needle));
+    let validity = haystack.nulls();
+
+    // Only convert the visible portion of the values array. For sliced
+    // ListArrays, values() returns the full underlying array but only
+    // elements between the first and last offset are referenced.
+    let offsets = haystack.offsets();
+    let first_offset = offsets[0].as_usize();
+    let last_offset = offsets[num_rows].as_usize();
+    let visible_values = haystack
+        .values()
+        .slice(first_offset, last_offset - first_offset);
+
+    // `not_distinct` treats NULL=NULL as true, matching the semantics of
+    // `array_positions`.
+    let eq_array = arrow_ord::cmp::not_distinct(&visible_values, 
&needle_datum)?;
+    let eq_bits = eq_array.values();
+
+    let num_matches = eq_bits.count_set_bits();
+    let mut positions: Vec<u64> = Vec::with_capacity(num_matches);
+    let mut result_offsets: Vec<i32> = Vec::with_capacity(num_rows + 1);
+    result_offsets.push(0);
+    let mut matches = eq_bits.set_indices().peekable();
+
+    // Match positions are relative to visible_values (0-based), so
+    // subtract first_offset from each offset when comparing.
+    for i in 0..num_rows {
+        let start = offsets[i].as_usize() - first_offset;
+        let end = offsets[i + 1].as_usize() - first_offset;
+
+        if validity.is_some_and(|v| v.is_null(i)) {
+            // Null row -> null output; advance past matches in range.
+            while matches.peek().is_some_and(|&p| p < end) {
+                matches.next();
+            }
+            result_offsets.push(positions.len() as i32);
+            continue;
+        }
+
+        // Collect all matches in [start, end).
+        while let Some(pos) = matches.next_if(|&p| p < end) {
+            positions.push((pos - start + 1) as u64);
+        }
+        result_offsets.push(positions.len() as i32);
+    }
+
+    debug_assert_eq!(result_offsets.len(), num_rows + 1);
+    Ok(Arc::new(ListArray::try_new(
+        Arc::new(Field::new_list_field(UInt64, true)),
+        OffsetBuffer::new(result_offsets.into()),
+        Arc::new(UInt64Array::from(positions)),
+        validity.cloned(),
+    )?))
+}
+
 #[cfg(test)]
 mod tests {
     use super::*;
     use arrow::array::AsArray;
     use arrow::datatypes::Int32Type;
     use datafusion_common::config::ConfigOptions;
-    use datafusion_expr::ScalarFunctionArgs;
 
     #[test]
     fn test_array_position_sliced_list() -> Result<()> {
@@ -540,4 +649,112 @@ mod tests {
 
         Ok(())
     }
+
+    #[test]
+    fn test_array_positions_sliced_list() -> Result<()> {
+        // [[10, 20, 30], [30, 40, 30], [50, 60, 30], [70, 80, 30]]
+        //   → slice(1,2) → [[30, 40, 30], [50, 60, 30]]
+        let list = ListArray::from_iter_primitive::<Int32Type, _, _>(vec![
+            Some(vec![Some(10), Some(20), Some(30)]),
+            Some(vec![Some(30), Some(40), Some(30)]),
+            Some(vec![Some(50), Some(60), Some(30)]),
+            Some(vec![Some(70), Some(80), Some(30)]),
+        ]);
+        let sliced = list.slice(1, 2);
+        let haystack_field =
+            Arc::new(Field::new("haystack", sliced.data_type().clone(), true));
+        let needle_field = Arc::new(Field::new("needle", DataType::Int32, 
true));
+        let return_field = Arc::new(Field::new(
+            "return",
+            List(Arc::new(Field::new_list_field(UInt64, true))),
+            true,
+        ));
+
+        let invoke = |needle: i32| -> Result<ArrayRef> {
+            ArrayPositions::new()
+                .invoke_with_args(ScalarFunctionArgs {
+                    args: vec![
+                        ColumnarValue::Array(Arc::new(sliced.clone())),
+                        
ColumnarValue::Scalar(ScalarValue::Int32(Some(needle))),
+                    ],
+                    arg_fields: vec![
+                        Arc::clone(&haystack_field),
+                        Arc::clone(&needle_field),
+                    ],
+                    number_rows: 2,
+                    return_field: Arc::clone(&return_field),
+                    config_options: Arc::new(ConfigOptions::default()),
+                })?
+                .into_array(2)
+        };
+
+        // Needle 30: appears at positions 1,3 in row 0 ([30,40,30])
+        // and position 3 in row 1 ([50,60,30]).
+        let output = invoke(30)?;
+        let output = output.as_list::<i32>();
+        let row0 = output.value(0);
+        let row0 = row0.as_primitive::<UInt64Type>();
+        assert_eq!(row0.values().as_ref(), &[1, 3]);
+        let row1 = output.value(1);
+        let row1 = row1.as_primitive::<UInt64Type>();
+        assert_eq!(row1.values().as_ref(), &[3]);
+
+        // Needle 10: only in the sliced-away prefix row → empty lists.
+        let output = invoke(10)?;
+        let output = output.as_list::<i32>();
+        assert!(output.value(0).is_empty());
+        assert!(output.value(1).is_empty());
+
+        // Needle 70: only in the sliced-away suffix row → empty lists.
+        let output = invoke(70)?;
+        let output = output.as_list::<i32>();
+        assert!(output.value(0).is_empty());
+        assert!(output.value(1).is_empty());
+
+        Ok(())
+    }
+
+    #[test]
+    fn test_array_positions_sliced_list_with_nulls() -> Result<()> {
+        // [[1, 2], null, [3, 1], [4, 5]]  →  slice(1,2)  →  [null, [3, 1]]
+        let list = ListArray::from_iter_primitive::<Int32Type, _, _>(vec![
+            Some(vec![Some(1), Some(2)]),
+            None,
+            Some(vec![Some(3), Some(1)]),
+            Some(vec![Some(4), Some(5)]),
+        ]);
+        let sliced = list.slice(1, 2);
+        let haystack_field =
+            Arc::new(Field::new("haystack", sliced.data_type().clone(), true));
+        let needle_field = Arc::new(Field::new("needle", DataType::Int32, 
true));
+        let return_field = Arc::new(Field::new(
+            "return",
+            List(Arc::new(Field::new_list_field(UInt64, true))),
+            true,
+        ));
+
+        let output = ArrayPositions::new()
+            .invoke_with_args(ScalarFunctionArgs {
+                args: vec![
+                    ColumnarValue::Array(Arc::new(sliced)),
+                    ColumnarValue::Scalar(ScalarValue::Int32(Some(1))),
+                ],
+                arg_fields: vec![Arc::clone(&haystack_field), 
Arc::clone(&needle_field)],
+                number_rows: 2,
+                return_field: Arc::clone(&return_field),
+                config_options: Arc::new(ConfigOptions::default()),
+            })?
+            .into_array(2)?;
+
+        let output = output.as_list::<i32>();
+        // Row 0 is null (from the sliced null row).
+        assert!(output.is_null(0));
+        // Row 1 is [3, 1] → needle 1 found at position 2.
+        assert!(!output.is_null(1));
+        let row1 = output.value(1);
+        let row1 = row1.as_primitive::<UInt64Type>();
+        assert_eq!(row1.values().as_ref(), &[2]);
+
+        Ok(())
+    }
 }
diff --git a/datafusion/sqllogictest/test_files/array.slt 
b/datafusion/sqllogictest/test_files/array.slt
index 573938af0c..a0da989990 100644
--- a/datafusion/sqllogictest/test_files/array.slt
+++ b/datafusion/sqllogictest/test_files/array.slt
@@ -4003,6 +4003,12 @@ select array_position(arrow_cast(['a', 'b', 'c', 'b'], 
'FixedSizeList(4, Utf8)')
 
 ## array_positions (aliases: `list_positions`)
 
+# array_positions with empty array
+query ?
+select array_positions(arrow_cast(make_array(), 'List(Int64)'), 1);
+----
+[]
+
 query ?
 select array_positions([1, 2, 3, 4, 5], null);
 ----
diff --git a/docs/source/user-guide/sql/scalar_functions.md 
b/docs/source/user-guide/sql/scalar_functions.md
index 254151c2c2..15ce587880 100644
--- a/docs/source/user-guide/sql/scalar_functions.md
+++ b/docs/source/user-guide/sql/scalar_functions.md
@@ -3823,7 +3823,7 @@ array_positions(array, element)
 #### Arguments
 
 - **array**: Array expression. Can be a constant, column, or function, and any 
combination of array operators.
-- **element**: Element to search for position in the array.
+- **element**: Element to search for in the array.
 
 #### Example
 


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to