This is an automated email from the ASF dual-hosted git repository.
github-bot pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/datafusion.git
The following commit(s) were added to refs/heads/main by this push:
new b6b542e87b perf: Optimize `array_positions()` for scalar needle
(#20770)
b6b542e87b is described below
commit b6b542e87b84f4744096106bea0de755b2e70cc5
Author: Neil Conway <[email protected]>
AuthorDate: Wed Mar 18 07:35:32 2026 -0400
perf: Optimize `array_positions()` for scalar needle (#20770)
## Which issue does this PR close?
- Closes #20769.
## Rationale for this change
`array_positions` previously compared the needle against each row's
sub-array individually. When the needle is a scalar (the common case),
we can do a single bulk `arrow_ord::cmp::not_distinct` comparison
against the entire flat values buffer and then walk the result bitmap,
which is significantly faster: the speedup on the `array_positions()`
microbenchmarks ranges from 5x to 40x, depending on the size of the
array.
The same pattern has already been applied to `array_position` (#20532),
and previously to other array UDFs.
## What changes are included in this PR?
- Add benchmarks for `array_positions`.
- Implement bulk-comparison optimization
- Refactor `array_position`'s existing fast path slightly for
consistency
- Code cleanup to use "haystack" and "needle" consistently, not vague
terms like "list_array" and "element"
- Add unit tests for `array_positions` with sliced ListArrays, for peace
of mind
- Add unit tests for sliced lists and sliced lists with nulls for the
new `array_positions` fast path.
## Are these changes tested?
Yes.
## Are there any user-facing changes?
No.
## AI usage
Multiple AI tools were used to iterate on this PR. I have reviewed and
understand the resulting code.
---------
Co-authored-by: Oleks V <[email protected]>
---
.../functions-nested/benches/array_position.rs | 109 ++++-
datafusion/functions-nested/src/position.rs | 473 +++++++++++++++------
datafusion/sqllogictest/test_files/array.slt | 6 +
docs/source/user-guide/sql/scalar_functions.md | 2 +-
4 files changed, 460 insertions(+), 130 deletions(-)
diff --git a/datafusion/functions-nested/benches/array_position.rs
b/datafusion/functions-nested/benches/array_position.rs
index 0836764844..c718b2b725 100644
--- a/datafusion/functions-nested/benches/array_position.rs
+++ b/datafusion/functions-nested/benches/array_position.rs
@@ -24,7 +24,7 @@ use criterion::{
use datafusion_common::ScalarValue;
use datafusion_common::config::ConfigOptions;
use datafusion_expr::{ColumnarValue, ScalarFunctionArgs, ScalarUDFImpl};
-use datafusion_functions_nested::position::ArrayPosition;
+use datafusion_functions_nested::position::{ArrayPosition, ArrayPositions};
use rand::Rng;
use rand::SeedableRng;
use rand::rngs::StdRng;
@@ -39,6 +39,7 @@ const SENTINEL_NEEDLE: i64 = -1;
fn criterion_benchmark(c: &mut Criterion) {
for size in [10, 100, 500] {
bench_array_position(c, size);
+ bench_array_positions(c, size);
}
}
@@ -146,6 +147,112 @@ fn bench_array_position(c: &mut Criterion, array_size:
usize) {
group.finish();
}
+fn bench_array_positions(c: &mut Criterion, array_size: usize) {
+ let mut group = c.benchmark_group("array_positions_i64");
+ let haystack_found_once = create_haystack_with_sentinel(
+ NUM_ROWS,
+ array_size,
+ NULL_DENSITY,
+ SENTINEL_NEEDLE,
+ 0,
+ );
+ let haystack_found_many = create_haystack_with_sentinels(
+ NUM_ROWS,
+ array_size,
+ NULL_DENSITY,
+ SENTINEL_NEEDLE,
+ );
+ let haystack_not_found =
+ create_haystack_without_sentinel(NUM_ROWS, array_size, NULL_DENSITY);
+ let num_rows = haystack_not_found.len();
+ let arg_fields: Vec<Arc<Field>> = vec![
+ Field::new("haystack", haystack_not_found.data_type().clone(),
false).into(),
+ Field::new("needle", DataType::Int64, false).into(),
+ ];
+ let return_field: Arc<Field> = Field::new(
+ "result",
+ DataType::List(Arc::new(Field::new_list_field(DataType::UInt64,
true))),
+ true,
+ )
+ .into();
+ let config_options = Arc::new(ConfigOptions::default());
+ let needle = ScalarValue::Int64(Some(SENTINEL_NEEDLE));
+
+ let args_found_once = vec![
+ ColumnarValue::Array(haystack_found_once.clone()),
+ ColumnarValue::Scalar(needle.clone()),
+ ];
+ group.bench_with_input(
+ BenchmarkId::new("found_once", array_size),
+ &array_size,
+ |b, _| {
+ let udf = ArrayPositions::new();
+ b.iter(|| {
+ black_box(
+ udf.invoke_with_args(ScalarFunctionArgs {
+ args: args_found_once.clone(),
+ arg_fields: arg_fields.clone(),
+ number_rows: num_rows,
+ return_field: return_field.clone(),
+ config_options: config_options.clone(),
+ })
+ .unwrap(),
+ )
+ })
+ },
+ );
+
+ let args_found_many = vec![
+ ColumnarValue::Array(haystack_found_many.clone()),
+ ColumnarValue::Scalar(needle.clone()),
+ ];
+ group.bench_with_input(
+ BenchmarkId::new("found_many", array_size),
+ &array_size,
+ |b, _| {
+ let udf = ArrayPositions::new();
+ b.iter(|| {
+ black_box(
+ udf.invoke_with_args(ScalarFunctionArgs {
+ args: args_found_many.clone(),
+ arg_fields: arg_fields.clone(),
+ number_rows: num_rows,
+ return_field: return_field.clone(),
+ config_options: config_options.clone(),
+ })
+ .unwrap(),
+ )
+ })
+ },
+ );
+
+ let args_not_found = vec![
+ ColumnarValue::Array(haystack_not_found.clone()),
+ ColumnarValue::Scalar(needle.clone()),
+ ];
+ group.bench_with_input(
+ BenchmarkId::new("not_found", array_size),
+ &array_size,
+ |b, _| {
+ let udf = ArrayPositions::new();
+ b.iter(|| {
+ black_box(
+ udf.invoke_with_args(ScalarFunctionArgs {
+ args: args_not_found.clone(),
+ arg_fields: arg_fields.clone(),
+ number_rows: num_rows,
+ return_field: return_field.clone(),
+ config_options: config_options.clone(),
+ })
+ .unwrap(),
+ )
+ })
+ },
+ );
+
+ group.finish();
+}
+
fn create_haystack_without_sentinel(
num_rows: usize,
array_size: usize,
diff --git a/datafusion/functions-nested/src/position.rs
b/datafusion/functions-nested/src/position.rs
index 0214b1552b..acdeb202f9 100644
--- a/datafusion/functions-nested/src/position.rs
+++ b/datafusion/functions-nested/src/position.rs
@@ -18,6 +18,7 @@
//! [`ScalarUDFImpl`] definitions for array_position and array_positions
functions.
use arrow::array::Scalar;
+use arrow::buffer::OffsetBuffer;
use arrow::datatypes::DataType;
use arrow::datatypes::{
DataType::{LargeList, List, UInt64},
@@ -25,7 +26,8 @@ use arrow::datatypes::{
};
use datafusion_common::ScalarValue;
use datafusion_expr::{
- ColumnarValue, Documentation, ScalarUDFImpl, Signature, Volatility,
+ ColumnarValue, Documentation, ScalarFunctionArgs, ScalarUDFImpl, Signature,
+ Volatility,
};
use datafusion_macros::user_doc;
@@ -122,57 +124,10 @@ impl ScalarUDFImpl for ArrayPosition {
Ok(UInt64)
}
- fn invoke_with_args(
- &self,
- args: datafusion_expr::ScalarFunctionArgs,
- ) -> Result<ColumnarValue> {
- let [first_arg, second_arg, third_arg @ ..] = args.args.as_slice()
else {
- return exec_err!("array_position expects two or three arguments");
- };
-
- match second_arg {
- ColumnarValue::Scalar(scalar_element) => {
- // Nested element types (List, Struct) can't use the fast path
- // (because Arrow's `non_distinct` does not support them).
- if scalar_element.data_type().is_nested() {
- return
make_scalar_function(array_position_inner)(&args.args);
- }
-
- // Determine batch length from whichever argument is columnar;
- // if all inputs are scalar, batch length is 1.
- let (num_rows, all_inputs_scalar) = match (first_arg,
third_arg.first()) {
- (ColumnarValue::Array(a), _) => (a.len(), false),
- (_, Some(ColumnarValue::Array(a))) => (a.len(), false),
- _ => (1, true),
- };
-
- let element_arr = scalar_element.to_array_of_size(1)?;
- let haystack = first_arg.to_array(num_rows)?;
- let arr_from = resolve_start_from(third_arg.first(),
num_rows)?;
-
- let result = match haystack.data_type() {
- List(_) => {
- let list = as_generic_list_array::<i32>(&haystack)?;
- array_position_scalar::<i32>(list, &element_arr,
&arr_from)
- }
- LargeList(_) => {
- let list = as_generic_list_array::<i64>(&haystack)?;
- array_position_scalar::<i64>(list, &element_arr,
&arr_from)
- }
- t => exec_err!("array_position does not support type
'{t}'."),
- }?;
-
- if all_inputs_scalar {
- Ok(ColumnarValue::Scalar(ScalarValue::try_from_array(
- &result, 0,
- )?))
- } else {
- Ok(ColumnarValue::Array(result))
- }
- }
- ColumnarValue::Array(_) => {
- make_scalar_function(array_position_inner)(&args.args)
- }
+ fn invoke_with_args(&self, args: ScalarFunctionArgs) ->
Result<ColumnarValue> {
+ match try_array_position_scalar(&args.args)? {
+ Some(result) => Ok(result),
+ None => make_scalar_function(array_position_inner)(&args.args),
}
}
@@ -185,6 +140,57 @@ impl ScalarUDFImpl for ArrayPosition {
}
}
+/// Attempts the scalar-needle fast path for `array_position`.
+fn try_array_position_scalar(args: &[ColumnarValue]) ->
Result<Option<ColumnarValue>> {
+ if args.len() < 2 || args.len() > 3 {
+ return exec_err!("array_position expects two or three arguments");
+ }
+
+ // Fallback to the generic code path if the needle is an array
+ let scalar_needle = match &args[1] {
+ ColumnarValue::Scalar(s) => s,
+ ColumnarValue::Array(_) => return Ok(None),
+ };
+
+ // `not_distinct` doesn't support nested types (List, Struct, etc.),
+ // so fall back to the generic code path for those.
+ if scalar_needle.data_type().is_nested() {
+ return Ok(None);
+ }
+
+ // Determine batch length from whichever argument is columnar;
+ // if all inputs are scalar, batch length is 1.
+ let (num_rows, all_inputs_scalar) = match (&args[0], args.get(2)) {
+ (ColumnarValue::Array(a), _) => (a.len(), false),
+ (_, Some(ColumnarValue::Array(a))) => (a.len(), false),
+ _ => (1, true),
+ };
+
+ let needle = scalar_needle.to_array_of_size(1)?;
+ let haystack = args[0].to_array(num_rows)?;
+ let arr_from = resolve_start_from(args.get(2), num_rows)?;
+
+ let result = match haystack.data_type() {
+ List(_) => {
+ let list = as_list_array(&haystack)?;
+ array_position_scalar::<i32>(list, &needle, &arr_from)
+ }
+ LargeList(_) => {
+ let list = as_large_list_array(&haystack)?;
+ array_position_scalar::<i64>(list, &needle, &arr_from)
+ }
+ t => exec_err!("array_position does not support type '{t}'"),
+ }?;
+
+ if all_inputs_scalar {
+ Ok(Some(ColumnarValue::Scalar(ScalarValue::try_from_array(
+ &result, 0,
+ )?)))
+ } else {
+ Ok(Some(ColumnarValue::Array(result)))
+ }
+}
+
fn array_position_inner(args: &[ArrayRef]) -> Result<ArrayRef> {
if args.len() < 2 || args.len() > 3 {
return exec_err!("array_position expects two or three arguments");
@@ -192,7 +198,7 @@ fn array_position_inner(args: &[ArrayRef]) ->
Result<ArrayRef> {
match &args[0].data_type() {
List(_) => general_position_dispatch::<i32>(args),
LargeList(_) => general_position_dispatch::<i64>(args),
- array_type => exec_err!("array_position does not support type
'{array_type}'."),
+ dt => exec_err!("array_position does not support type '{dt}'"),
}
}
@@ -216,48 +222,46 @@ fn resolve_start_from(
}
}
-/// Fast path for `array_position` when the element is a scalar.
+/// Fast path for `array_position` when the needle is scalar.
///
-/// Performs a single bulk `not_distinct` comparison of the scalar element
-/// against the entire flattened values buffer, then walks the result bitmap
-/// using offsets to find per-row first-match positions.
+/// Performs a single bulk `not_distinct` comparison of the needle against the
+/// entire flat values buffer, then walks the result bitmap using offsets to
+/// find per-row first-match positions.
fn array_position_scalar<O: OffsetSizeTrait>(
- list_array: &GenericListArray<O>,
- element_array: &ArrayRef,
+ haystack: &GenericListArray<O>,
+ needle: &ArrayRef,
arr_from: &[i64], // 0-indexed
) -> Result<ArrayRef> {
- crate::utils::check_datatypes(
- "array_position",
- &[list_array.values(), element_array],
- )?;
+ crate::utils::check_datatypes("array_position", &[haystack.values(),
needle])?;
- if list_array.len() == 0 {
+ if haystack.len() == 0 {
return Ok(Arc::new(UInt64Array::new_null(0)));
}
- let element_datum = Scalar::new(Arc::clone(element_array));
- let validity = list_array.nulls();
+ let needle_datum = Scalar::new(Arc::clone(needle));
+ let validity = haystack.nulls();
- // Only compare the visible portion of the values buffer, which avoids
- // wasted work for sliced ListArrays.
- let offsets = list_array.offsets();
+ // Only convert the visible portion of the values array. For sliced
+ // ListArrays, values() returns the full underlying array but only
+ // elements between the first and last offset are referenced.
+ let offsets = haystack.offsets();
let first_offset = offsets[0].as_usize();
- let last_offset = offsets[list_array.len()].as_usize();
- let visible_values = list_array
+ let last_offset = offsets[haystack.len()].as_usize();
+ let visible_values = haystack
.values()
.slice(first_offset, last_offset - first_offset);
// `not_distinct` treats NULL=NULL as true, matching the semantics of
- // `array_position`
- let eq_array = arrow_ord::cmp::not_distinct(&visible_values,
&element_datum)?;
+ // `array_position`.
+ let eq_array = arrow_ord::cmp::not_distinct(&visible_values,
&needle_datum)?;
let eq_bits = eq_array.values();
- let mut result: Vec<Option<u64>> = Vec::with_capacity(list_array.len());
+ let mut result: Vec<Option<u64>> = Vec::with_capacity(haystack.len());
let mut matches = eq_bits.set_indices().peekable();
// Match positions are relative to visible_values (0-based), so
// subtract first_offset from each offset when comparing.
- for i in 0..list_array.len() {
+ for i in 0..haystack.len() {
let start = offsets[i].as_usize() - first_offset;
let end = offsets[i + 1].as_usize() - first_offset;
@@ -295,18 +299,15 @@ fn array_position_scalar<O: OffsetSizeTrait>(
}
}
- debug_assert_eq!(result.len(), list_array.len());
+ debug_assert_eq!(result.len(), haystack.len());
Ok(Arc::new(UInt64Array::from(result)))
}
fn general_position_dispatch<O: OffsetSizeTrait>(args: &[ArrayRef]) ->
Result<ArrayRef> {
- let list_array = as_generic_list_array::<O>(&args[0])?;
- let element_array = &args[1];
+ let haystack = as_generic_list_array::<O>(&args[0])?;
+ let needle = &args[1];
- crate::utils::check_datatypes(
- "array_position",
- &[list_array.values(), element_array],
- )?;
+ crate::utils::check_datatypes("array_position", &[haystack.values(),
needle])?;
let arr_from = if args.len() == 3 {
as_int64_array(&args[2])?
@@ -315,34 +316,30 @@ fn general_position_dispatch<O: OffsetSizeTrait>(args:
&[ArrayRef]) -> Result<Ar
.map(|&x| x - 1)
.collect::<Vec<_>>()
} else {
- vec![0; list_array.len()]
+ vec![0; haystack.len()]
};
- for (arr, &from) in list_array.iter().zip(arr_from.iter()) {
- // If `arr` is `None`: we will get null if we got null in the array,
so we don't need to check
- if !arr.is_none_or(|arr| from >= 0 && (from as usize) <= arr.len()) {
+ for (row, &from) in haystack.iter().zip(arr_from.iter()) {
+ if !row.is_none_or(|row| from >= 0 && (from as usize) <= row.len()) {
return exec_err!("start_from out of bounds: {}", from + 1);
}
}
- generic_position::<O>(list_array, element_array, &arr_from)
+ generic_position::<O>(haystack, needle, &arr_from)
}
-fn generic_position<OffsetSize: OffsetSizeTrait>(
- list_array: &GenericListArray<OffsetSize>,
- element_array: &ArrayRef,
+fn generic_position<O: OffsetSizeTrait>(
+ haystack: &GenericListArray<O>,
+ needle: &ArrayRef,
arr_from: &[i64], // 0-indexed
) -> Result<ArrayRef> {
- let mut data = Vec::with_capacity(list_array.len());
+ let mut data = Vec::with_capacity(haystack.len());
- for (row_index, (list_array_row, &from)) in
- list_array.iter().zip(arr_from.iter()).enumerate()
- {
+ for (row_index, (row, &from)) in
haystack.iter().zip(arr_from.iter()).enumerate() {
let from = from as usize;
- if let Some(list_array_row) = list_array_row {
- let eq_array =
- compare_element_to_list(&list_array_row, element_array,
row_index, true)?;
+ if let Some(row) = row {
+ let eq_array = compare_element_to_list(&row, needle, row_index,
true)?;
// Collect `true`s in 1-indexed positions
let index = eq_array
@@ -384,17 +381,20 @@ make_udf_expr_and_func!(
name = "array",
description = "Array expression. Can be a constant, column, or
function, and any combination of array operators."
),
- argument(
- name = "element",
- description = "Element to search for position in the array."
- )
+ argument(name = "element", description = "Element to search for in the
array.")
)]
#[derive(Debug, PartialEq, Eq, Hash)]
-pub(super) struct ArrayPositions {
+pub struct ArrayPositions {
signature: Signature,
aliases: Vec<String>,
}
+impl Default for ArrayPositions {
+ fn default() -> Self {
+ Self::new()
+ }
+}
+
impl ArrayPositions {
pub fn new() -> Self {
Self {
@@ -420,11 +420,11 @@ impl ScalarUDFImpl for ArrayPositions {
Ok(List(Arc::new(Field::new_list_field(UInt64, true))))
}
- fn invoke_with_args(
- &self,
- args: datafusion_expr::ScalarFunctionArgs,
- ) -> Result<ColumnarValue> {
- make_scalar_function(array_positions_inner)(&args.args)
+ fn invoke_with_args(&self, args: ScalarFunctionArgs) ->
Result<ColumnarValue> {
+ match try_array_positions_scalar(&args.args)? {
+ Some(result) => Ok(result),
+ None => make_scalar_function(array_positions_inner)(&args.args),
+ }
}
fn aliases(&self) -> &[String] {
@@ -436,36 +436,70 @@ impl ScalarUDFImpl for ArrayPositions {
}
}
-fn array_positions_inner(args: &[ArrayRef]) -> Result<ArrayRef> {
- let [array, element] = take_function_args("array_positions", args)?;
+/// Attempts the scalar-needle fast path for `array_positions`.
+fn try_array_positions_scalar(args: &[ColumnarValue]) ->
Result<Option<ColumnarValue>> {
+ let [haystack_arg, needle_arg] = take_function_args("array_positions",
args)?;
+
+ let scalar_needle = match needle_arg {
+ ColumnarValue::Scalar(s) => s,
+ ColumnarValue::Array(_) => return Ok(None),
+ };
+
+ // `not_distinct` doesn't support nested types (List, Struct, etc.),
+ // so fall back to the per-row path for those.
+ if scalar_needle.data_type().is_nested() {
+ return Ok(None);
+ }
+
+ let (num_rows, all_inputs_scalar) = match haystack_arg {
+ ColumnarValue::Array(a) => (a.len(), false),
+ ColumnarValue::Scalar(_) => (1, true),
+ };
- match &array.data_type() {
+ let needle = scalar_needle.to_array_of_size(1)?;
+ let haystack = haystack_arg.to_array(num_rows)?;
+
+ let result = match haystack.data_type() {
List(_) => {
- let arr = as_list_array(&array)?;
- crate::utils::check_datatypes("array_positions", &[arr.values(),
element])?;
- general_positions::<i32>(arr, element)
+ let list = as_list_array(&haystack)?;
+ array_positions_scalar::<i32>(list, &needle)
}
LargeList(_) => {
- let arr = as_large_list_array(&array)?;
- crate::utils::check_datatypes("array_positions", &[arr.values(),
element])?;
- general_positions::<i64>(arr, element)
- }
- array_type => {
- exec_err!("array_positions does not support type '{array_type}'.")
+ let list = as_large_list_array(&haystack)?;
+ array_positions_scalar::<i64>(list, &needle)
}
+ t => exec_err!("array_positions does not support type '{t}'"),
+ }?;
+
+ if all_inputs_scalar {
+ Ok(Some(ColumnarValue::Scalar(ScalarValue::try_from_array(
+ &result, 0,
+ )?)))
+ } else {
+ Ok(Some(ColumnarValue::Array(result)))
}
}
-fn general_positions<OffsetSize: OffsetSizeTrait>(
- list_array: &GenericListArray<OffsetSize>,
- element_array: &ArrayRef,
+fn array_positions_inner(args: &[ArrayRef]) -> Result<ArrayRef> {
+ let [haystack, needle] = take_function_args("array_positions", args)?;
+
+ match &haystack.data_type() {
+ List(_) => general_positions::<i32>(as_list_array(&haystack)?, needle),
+ LargeList(_) =>
general_positions::<i64>(as_large_list_array(&haystack)?, needle),
+ dt => exec_err!("array_positions does not support type '{dt}'"),
+ }
+}
+
+fn general_positions<O: OffsetSizeTrait>(
+ haystack: &GenericListArray<O>,
+ needle: &ArrayRef,
) -> Result<ArrayRef> {
- let mut data = Vec::with_capacity(list_array.len());
+ crate::utils::check_datatypes("array_positions", &[haystack.values(),
needle])?;
+ let mut data = Vec::with_capacity(haystack.len());
- for (row_index, list_array_row) in list_array.iter().enumerate() {
- if let Some(list_array_row) = list_array_row {
- let eq_array =
- compare_element_to_list(&list_array_row, element_array,
row_index, true)?;
+ for (row_index, row) in haystack.iter().enumerate() {
+ if let Some(row) = row {
+ let eq_array = compare_element_to_list(&row, needle, row_index,
true)?;
// Collect `true`s in 1-indexed positions
let indexes = eq_array
@@ -485,13 +519,88 @@ fn general_positions<OffsetSize: OffsetSizeTrait>(
))
}
+/// Fast path for `array_positions` when the needle is scalar.
+///
+/// Performs a single bulk `not_distinct` comparison of the needle against the
+/// entire flat values buffer, then walks the result bitmap using offsets to
+/// collect all per-row match positions.
+fn array_positions_scalar<O: OffsetSizeTrait>(
+ haystack: &GenericListArray<O>,
+ needle: &ArrayRef,
+) -> Result<ArrayRef> {
+ crate::utils::check_datatypes("array_positions", &[haystack.values(),
needle])?;
+
+ let num_rows = haystack.len();
+ if num_rows == 0 {
+ return Ok(Arc::new(ListArray::try_new(
+ Arc::new(Field::new_list_field(UInt64, true)),
+ OffsetBuffer::new_zeroed(1),
+ Arc::new(UInt64Array::from(Vec::<u64>::new())),
+ None,
+ )?));
+ }
+
+ let needle_datum = Scalar::new(Arc::clone(needle));
+ let validity = haystack.nulls();
+
+ // Only convert the visible portion of the values array. For sliced
+ // ListArrays, values() returns the full underlying array but only
+ // elements between the first and last offset are referenced.
+ let offsets = haystack.offsets();
+ let first_offset = offsets[0].as_usize();
+ let last_offset = offsets[num_rows].as_usize();
+ let visible_values = haystack
+ .values()
+ .slice(first_offset, last_offset - first_offset);
+
+ // `not_distinct` treats NULL=NULL as true, matching the semantics of
+ // `array_positions`.
+ let eq_array = arrow_ord::cmp::not_distinct(&visible_values,
&needle_datum)?;
+ let eq_bits = eq_array.values();
+
+ let num_matches = eq_bits.count_set_bits();
+ let mut positions: Vec<u64> = Vec::with_capacity(num_matches);
+ let mut result_offsets: Vec<i32> = Vec::with_capacity(num_rows + 1);
+ result_offsets.push(0);
+ let mut matches = eq_bits.set_indices().peekable();
+
+ // Match positions are relative to visible_values (0-based), so
+ // subtract first_offset from each offset when comparing.
+ for i in 0..num_rows {
+ let start = offsets[i].as_usize() - first_offset;
+ let end = offsets[i + 1].as_usize() - first_offset;
+
+ if validity.is_some_and(|v| v.is_null(i)) {
+ // Null row -> null output; advance past matches in range.
+ while matches.peek().is_some_and(|&p| p < end) {
+ matches.next();
+ }
+ result_offsets.push(positions.len() as i32);
+ continue;
+ }
+
+ // Collect all matches in [start, end).
+ while let Some(pos) = matches.next_if(|&p| p < end) {
+ positions.push((pos - start + 1) as u64);
+ }
+ result_offsets.push(positions.len() as i32);
+ }
+
+ debug_assert_eq!(result_offsets.len(), num_rows + 1);
+ Ok(Arc::new(ListArray::try_new(
+ Arc::new(Field::new_list_field(UInt64, true)),
+ OffsetBuffer::new(result_offsets.into()),
+ Arc::new(UInt64Array::from(positions)),
+ validity.cloned(),
+ )?))
+}
+
#[cfg(test)]
mod tests {
use super::*;
use arrow::array::AsArray;
use arrow::datatypes::Int32Type;
use datafusion_common::config::ConfigOptions;
- use datafusion_expr::ScalarFunctionArgs;
#[test]
fn test_array_position_sliced_list() -> Result<()> {
@@ -540,4 +649,112 @@ mod tests {
Ok(())
}
+
+ #[test]
+ fn test_array_positions_sliced_list() -> Result<()> {
+ // [[10, 20, 30], [30, 40, 30], [50, 60, 30], [70, 80, 30]]
+ // → slice(1,2) → [[30, 40, 30], [50, 60, 30]]
+ let list = ListArray::from_iter_primitive::<Int32Type, _, _>(vec![
+ Some(vec![Some(10), Some(20), Some(30)]),
+ Some(vec![Some(30), Some(40), Some(30)]),
+ Some(vec![Some(50), Some(60), Some(30)]),
+ Some(vec![Some(70), Some(80), Some(30)]),
+ ]);
+ let sliced = list.slice(1, 2);
+ let haystack_field =
+ Arc::new(Field::new("haystack", sliced.data_type().clone(), true));
+ let needle_field = Arc::new(Field::new("needle", DataType::Int32,
true));
+ let return_field = Arc::new(Field::new(
+ "return",
+ List(Arc::new(Field::new_list_field(UInt64, true))),
+ true,
+ ));
+
+ let invoke = |needle: i32| -> Result<ArrayRef> {
+ ArrayPositions::new()
+ .invoke_with_args(ScalarFunctionArgs {
+ args: vec![
+ ColumnarValue::Array(Arc::new(sliced.clone())),
+
ColumnarValue::Scalar(ScalarValue::Int32(Some(needle))),
+ ],
+ arg_fields: vec![
+ Arc::clone(&haystack_field),
+ Arc::clone(&needle_field),
+ ],
+ number_rows: 2,
+ return_field: Arc::clone(&return_field),
+ config_options: Arc::new(ConfigOptions::default()),
+ })?
+ .into_array(2)
+ };
+
+ // Needle 30: appears at positions 1,3 in row 0 ([30,40,30])
+ // and position 3 in row 1 ([50,60,30]).
+ let output = invoke(30)?;
+ let output = output.as_list::<i32>();
+ let row0 = output.value(0);
+ let row0 = row0.as_primitive::<UInt64Type>();
+ assert_eq!(row0.values().as_ref(), &[1, 3]);
+ let row1 = output.value(1);
+ let row1 = row1.as_primitive::<UInt64Type>();
+ assert_eq!(row1.values().as_ref(), &[3]);
+
+ // Needle 10: only in the sliced-away prefix row → empty lists.
+ let output = invoke(10)?;
+ let output = output.as_list::<i32>();
+ assert!(output.value(0).is_empty());
+ assert!(output.value(1).is_empty());
+
+ // Needle 70: only in the sliced-away suffix row → empty lists.
+ let output = invoke(70)?;
+ let output = output.as_list::<i32>();
+ assert!(output.value(0).is_empty());
+ assert!(output.value(1).is_empty());
+
+ Ok(())
+ }
+
+ #[test]
+ fn test_array_positions_sliced_list_with_nulls() -> Result<()> {
+ // [[1, 2], null, [3, 1], [4, 5]] → slice(1,2) → [null, [3, 1]]
+ let list = ListArray::from_iter_primitive::<Int32Type, _, _>(vec![
+ Some(vec![Some(1), Some(2)]),
+ None,
+ Some(vec![Some(3), Some(1)]),
+ Some(vec![Some(4), Some(5)]),
+ ]);
+ let sliced = list.slice(1, 2);
+ let haystack_field =
+ Arc::new(Field::new("haystack", sliced.data_type().clone(), true));
+ let needle_field = Arc::new(Field::new("needle", DataType::Int32,
true));
+ let return_field = Arc::new(Field::new(
+ "return",
+ List(Arc::new(Field::new_list_field(UInt64, true))),
+ true,
+ ));
+
+ let output = ArrayPositions::new()
+ .invoke_with_args(ScalarFunctionArgs {
+ args: vec![
+ ColumnarValue::Array(Arc::new(sliced)),
+ ColumnarValue::Scalar(ScalarValue::Int32(Some(1))),
+ ],
+ arg_fields: vec![Arc::clone(&haystack_field),
Arc::clone(&needle_field)],
+ number_rows: 2,
+ return_field: Arc::clone(&return_field),
+ config_options: Arc::new(ConfigOptions::default()),
+ })?
+ .into_array(2)?;
+
+ let output = output.as_list::<i32>();
+ // Row 0 is null (from the sliced null row).
+ assert!(output.is_null(0));
+ // Row 1 is [3, 1] → needle 1 found at position 2.
+ assert!(!output.is_null(1));
+ let row1 = output.value(1);
+ let row1 = row1.as_primitive::<UInt64Type>();
+ assert_eq!(row1.values().as_ref(), &[2]);
+
+ Ok(())
+ }
}
diff --git a/datafusion/sqllogictest/test_files/array.slt
b/datafusion/sqllogictest/test_files/array.slt
index 573938af0c..a0da989990 100644
--- a/datafusion/sqllogictest/test_files/array.slt
+++ b/datafusion/sqllogictest/test_files/array.slt
@@ -4003,6 +4003,12 @@ select array_position(arrow_cast(['a', 'b', 'c', 'b'],
'FixedSizeList(4, Utf8)')
## array_positions (aliases: `list_positions`)
+# array_positions with empty array
+query ?
+select array_positions(arrow_cast(make_array(), 'List(Int64)'), 1);
+----
+[]
+
query ?
select array_positions([1, 2, 3, 4, 5], null);
----
diff --git a/docs/source/user-guide/sql/scalar_functions.md
b/docs/source/user-guide/sql/scalar_functions.md
index 254151c2c2..15ce587880 100644
--- a/docs/source/user-guide/sql/scalar_functions.md
+++ b/docs/source/user-guide/sql/scalar_functions.md
@@ -3823,7 +3823,7 @@ array_positions(array, element)
#### Arguments
- **array**: Array expression. Can be a constant, column, or function, and any
combination of array operators.
-- **element**: Element to search for position in the array.
+- **element**: Element to search for in the array.
#### Example
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]