Jefffrey commented on code in PR #21834:
URL: https://github.com/apache/datafusion/pull/21834#discussion_r3139349585
##########
datafusion/functions-nested/benches/array_remove.rs:
##########
@@ -16,74 +16,61 @@
// under the License.
use arrow::array::{
- Array, ArrayRef, BinaryArray, BooleanArray, Decimal128Array,
FixedSizeBinaryArray,
- Float64Array, Int64Array, ListArray, StringArray,
+ Array, ArrayBuilder, ArrayRef, BooleanBuilder, FixedSizeBinaryArray,
Int64Builder,
+ ListArray, ListBuilder, StringBuilder,
};
-use arrow::buffer::OffsetBuffer;
+use arrow::buffer::{NullBuffer, OffsetBuffer};
use arrow::datatypes::{DataType, Field};
use criterion::{
criterion_group, criterion_main, {BenchmarkId, Criterion},
};
use datafusion_common::ScalarValue;
use datafusion_common::config::ConfigOptions;
use datafusion_expr::{ColumnarValue, ScalarFunctionArgs, ScalarUDFImpl};
-use datafusion_functions_nested::remove::ArrayRemove;
+use datafusion_functions_nested::remove::{ArrayRemove, ArrayRemoveAll,
ArrayRemoveN};
use rand::Rng;
use rand::SeedableRng;
use rand::rngs::StdRng;
+use rand::seq::IndexedRandom;
use std::hint::black_box;
use std::sync::Arc;
const NUM_ROWS: usize = 10000;
-const ARRAY_SIZES: &[usize] = &[10, 100, 500];
+const LIST_SIZES: &[usize] = &[10, 100, 500];
const SEED: u64 = 42;
-const NULL_DENSITY: f64 = 0.1;
+const HAYSTACK_NULL_DENSITY: f64 = 0.1;
+const NEEDLE_DENSITY: f64 = 0.1;
Review Comment:
`NULL_DENSITY` previously determined the nulls within the lists inside list
arrays; the list array itself always had no null buffer so each element was
valid. Changing the way we generate test data (see next comments)
##########
datafusion/functions-nested/src/remove.rs:
##########
@@ -163,11 +163,17 @@ make_udf_expr_and_func!(
argument(name = "max", description = "Number of first occurrences to
remove.")
)]
#[derive(Debug, PartialEq, Eq, Hash)]
-pub(super) struct ArrayRemoveN {
+pub struct ArrayRemoveN {
Review Comment:
To make them accessible in benchmarks
##########
datafusion/functions-nested/benches/array_remove.rs:
##########
@@ -341,231 +350,145 @@ fn bench_array_remove_fixed_size_binary(c: &mut
Criterion) {
group.finish();
}
-fn create_args(list_array: ArrayRef, element: ScalarValue) ->
Vec<ColumnarValue> {
- vec![
- ColumnarValue::Array(list_array),
- ColumnarValue::Scalar(element),
- ]
+#[inline]
+fn create_args(haystack: ArrayRef, needle: ScalarValue) -> ScalarFunctionArgs {
+ let haystack_type = haystack.data_type().clone();
+ let needle_type = needle.data_type().clone();
+ ScalarFunctionArgs {
+ args: vec![
+ ColumnarValue::Array(haystack),
+ ColumnarValue::Scalar(needle),
+ ],
+ arg_fields: vec![
+ Field::new("haystack", haystack_type.clone(), true).into(),
+ Field::new("needle", needle_type, true).into(),
+ ],
+ number_rows: NUM_ROWS,
+ return_field: Field::new("result", haystack_type, true).into(),
+ config_options: Arc::new(ConfigOptions::default()),
+ }
}
-fn create_int64_list_array(
- num_rows: usize,
- array_size: usize,
- null_density: f64,
-) -> ArrayRef {
- let mut rng = StdRng::seed_from_u64(SEED);
- let values = (0..num_rows * array_size)
- .map(|_| {
- if rng.random::<f64>() < null_density {
- None
- } else {
- Some(rng.random_range(0..array_size as i64))
- }
- })
- .collect::<Int64Array>();
- let offsets = (0..=num_rows)
- .map(|i| (i * array_size) as i32)
- .collect::<Vec<i32>>();
-
- Arc::new(
- ListArray::try_new(
- Arc::new(Field::new("item", DataType::Int64, true)),
- OffsetBuffer::new(offsets.into()),
- Arc::new(values),
- None,
- )
- .unwrap(),
- )
+#[inline]
+fn create_args_n(
+ haystack: ArrayRef,
+ needle: ScalarValue,
+ n: ScalarValue,
+) -> ScalarFunctionArgs {
+ let haystack_type = haystack.data_type().clone();
+ let needle_type = needle.data_type().clone();
+ let n_type = n.data_type().clone();
+ ScalarFunctionArgs {
+ args: vec![
+ ColumnarValue::Array(haystack),
+ ColumnarValue::Scalar(needle),
+ ColumnarValue::Scalar(n),
+ ],
+ arg_fields: vec![
+ Field::new("haystack", haystack_type.clone(), true).into(),
+ Field::new("needle", needle_type, true).into(),
+ Field::new("n", n_type, true).into(),
+ ],
+ number_rows: NUM_ROWS,
+ return_field: Field::new("result", haystack_type, true).into(),
+ config_options: Arc::new(ConfigOptions::default()),
+ }
}
-fn create_f64_list_array(
- num_rows: usize,
- array_size: usize,
- null_density: f64,
-) -> ArrayRef {
+fn create_list_array<Builder, Item>(
Review Comment:
This is a generic data generator for types which work with
`ListArray::from_nested_iter`
- Unfortunately `FixedSizeBinaryBuilder` doesn't satisfy all the required
bounds so fixed size binary array needs to be a separate function (same with
nested list)
Previously we generated quite randomly:
1. For each row in the haystack array:
2. Generate the values of the fixed sized list element, where values may be
null, and values in range `0..list_size`
3. Collect these list elements into final haystack array which has no nulls
Some problems with this approach is we don't control the presence of the
element to remove (the needle).
- For example in fixed size binary generation, it generated random 16 bytes
for each value, and the needle was a fixed `[1_u8; 16]`; that is quite a search
area for the needle
This new approach uses `NEEDLE_DENSITY` to determine within each row of the
haystack array how many of these to place, otherwise filling with some other
filler elements (which aren't really important; we don't need a widespread of
these values, just values that *aren't* the needle). We then also add nulls to
the final haystack array.
##########
datafusion/functions-nested/benches/array_remove.rs:
##########
@@ -16,74 +16,61 @@
// under the License.
use arrow::array::{
- Array, ArrayRef, BinaryArray, BooleanArray, Decimal128Array,
FixedSizeBinaryArray,
- Float64Array, Int64Array, ListArray, StringArray,
+ Array, ArrayBuilder, ArrayRef, BooleanBuilder, FixedSizeBinaryArray,
Int64Builder,
+ ListArray, ListBuilder, StringBuilder,
};
-use arrow::buffer::OffsetBuffer;
+use arrow::buffer::{NullBuffer, OffsetBuffer};
use arrow::datatypes::{DataType, Field};
use criterion::{
criterion_group, criterion_main, {BenchmarkId, Criterion},
};
use datafusion_common::ScalarValue;
use datafusion_common::config::ConfigOptions;
use datafusion_expr::{ColumnarValue, ScalarFunctionArgs, ScalarUDFImpl};
-use datafusion_functions_nested::remove::ArrayRemove;
+use datafusion_functions_nested::remove::{ArrayRemove, ArrayRemoveAll,
ArrayRemoveN};
use rand::Rng;
use rand::SeedableRng;
use rand::rngs::StdRng;
+use rand::seq::IndexedRandom;
use std::hint::black_box;
use std::sync::Arc;
const NUM_ROWS: usize = 10000;
-const ARRAY_SIZES: &[usize] = &[10, 100, 500];
+const LIST_SIZES: &[usize] = &[10, 100, 500];
const SEED: u64 = 42;
-const NULL_DENSITY: f64 = 0.1;
+const HAYSTACK_NULL_DENSITY: f64 = 0.1;
+const NEEDLE_DENSITY: f64 = 0.1;
fn criterion_benchmark(c: &mut Criterion) {
- // Test array_remove with different data types and array sizes
- // TODO: Add performance tests for nested datatypes
bench_array_remove_int64(c);
- bench_array_remove_f64(c);
+ bench_array_remove_n_int64(c);
+ bench_array_remove_all_int64(c);
+
+ bench_array_remove_int64_nested(c);
+ bench_array_remove_n_int64_nested(c);
+ bench_array_remove_all_int64_nested(c);
+
bench_array_remove_strings(c);
- bench_array_remove_binary(c);
bench_array_remove_boolean(c);
- bench_array_remove_decimal64(c);
Review Comment:
Decided to remove decimal & f64 as it seems redundant to have them as i64
should be enough to test primitive types; same reason for removing binary since
we can just use string benchmark
##########
datafusion/functions-nested/benches/array_remove.rs:
##########
@@ -92,39 +79,29 @@ fn bench_array_remove_int64(c: &mut Criterion) {
group.finish();
}
-fn bench_array_remove_f64(c: &mut Criterion) {
- let mut group = c.benchmark_group("array_remove_f64");
+fn bench_array_remove_n_int64(c: &mut Criterion) {
+ let mut group = c.benchmark_group("array_remove_n_int64");
- for &array_size in ARRAY_SIZES {
- let list_array = create_f64_list_array(NUM_ROWS, array_size,
NULL_DENSITY);
- let element_to_remove = ScalarValue::Float64(Some(1.0));
- let args = create_args(list_array.clone(), element_to_remove.clone());
+ let filler_values = [None, Some(1), Some(2), Some(3), Some(4), Some(5)];
+ let needle = 0;
+ for &list_size in LIST_SIZES {
+ let list_array =
+ create_list_array::<Int64Builder, _>(list_size, needle,
&filler_values);
+ let n = (NEEDLE_DENSITY / 2.0 * list_size as f64) as i64;
+ let n = 2.max(n);
Review Comment:
For `array_remove_n`, roughly try to remove half of the average amount of
needles in each list in the haystack (capping it at 2 minimum to differ from
`array_remove`)
##########
datafusion/functions-nested/benches/array_remove.rs:
##########
@@ -16,74 +16,61 @@
// under the License.
use arrow::array::{
- Array, ArrayRef, BinaryArray, BooleanArray, Decimal128Array,
FixedSizeBinaryArray,
- Float64Array, Int64Array, ListArray, StringArray,
+ Array, ArrayBuilder, ArrayRef, BooleanBuilder, FixedSizeBinaryArray,
Int64Builder,
+ ListArray, ListBuilder, StringBuilder,
};
-use arrow::buffer::OffsetBuffer;
+use arrow::buffer::{NullBuffer, OffsetBuffer};
use arrow::datatypes::{DataType, Field};
use criterion::{
criterion_group, criterion_main, {BenchmarkId, Criterion},
};
use datafusion_common::ScalarValue;
use datafusion_common::config::ConfigOptions;
use datafusion_expr::{ColumnarValue, ScalarFunctionArgs, ScalarUDFImpl};
-use datafusion_functions_nested::remove::ArrayRemove;
+use datafusion_functions_nested::remove::{ArrayRemove, ArrayRemoveAll,
ArrayRemoveN};
use rand::Rng;
use rand::SeedableRng;
use rand::rngs::StdRng;
+use rand::seq::IndexedRandom;
use std::hint::black_box;
use std::sync::Arc;
const NUM_ROWS: usize = 10000;
-const ARRAY_SIZES: &[usize] = &[10, 100, 500];
+const LIST_SIZES: &[usize] = &[10, 100, 500];
const SEED: u64 = 42;
-const NULL_DENSITY: f64 = 0.1;
+const HAYSTACK_NULL_DENSITY: f64 = 0.1;
+const NEEDLE_DENSITY: f64 = 0.1;
fn criterion_benchmark(c: &mut Criterion) {
- // Test array_remove with different data types and array sizes
- // TODO: Add performance tests for nested datatypes
bench_array_remove_int64(c);
- bench_array_remove_f64(c);
+ bench_array_remove_n_int64(c);
+ bench_array_remove_all_int64(c);
+
+ bench_array_remove_int64_nested(c);
+ bench_array_remove_n_int64_nested(c);
+ bench_array_remove_all_int64_nested(c);
Review Comment:
We didn't benchmark `array_remove_n` and `array_remove_all` before so adding
them here; same for nested types
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]