This is an automated email from the ASF dual-hosted git repository.
alamb pushed a commit to branch branch-53
in repository https://gitbox.apache.org/repos/asf/datafusion.git
The following commit(s) were added to refs/heads/branch-53 by this push:
new 519866cd48 [branch-53] perf: Optimize `to_char` to allocate less, fix
NULL handling (#20635) (#20885)
519866cd48 is described below
commit 519866cd48042a5d92c28eade376086a8cbedd2e
Author: Andrew Lamb <[email protected]>
AuthorDate: Thu Mar 12 07:07:01 2026 -0400
[branch-53] perf: Optimize `to_char` to allocate less, fix NULL handling
(#20635) (#20885)
- Part of https://github.com/apache/datafusion/issues/19692
- Closes https://github.com/apache/datafusion/issues/20634 on branch-53
This PR:
- Backports https://github.com/apache/datafusion/pull/20635 from
@neilconway to the branch-53 line
Co-authored-by: Neil Conway <[email protected]>
---
datafusion/functions/benches/to_char.rs | 61 +++---
datafusion/functions/src/datetime/to_char.rs | 214 +++++++++------------
.../test_files/datetime/timestamps.slt | 19 +-
3 files changed, 130 insertions(+), 164 deletions(-)
diff --git a/datafusion/functions/benches/to_char.rs
b/datafusion/functions/benches/to_char.rs
index 65f4999d23..4d866570b7 100644
--- a/datafusion/functions/benches/to_char.rs
+++ b/datafusion/functions/benches/to_char.rs
@@ -18,13 +18,12 @@
use std::hint::black_box;
use std::sync::Arc;
-use arrow::array::{ArrayRef, Date32Array, StringArray};
+use arrow::array::{ArrayRef, Date32Array, Date64Array, StringArray};
use arrow::datatypes::{DataType, Field};
use chrono::TimeDelta;
use chrono::prelude::*;
use criterion::{Criterion, criterion_group, criterion_main};
use datafusion_common::ScalarValue;
-use datafusion_common::ScalarValue::TimestampNanosecond;
use datafusion_common::config::ConfigOptions;
use datafusion_expr::{ColumnarValue, ScalarFunctionArgs};
use datafusion_functions::datetime::to_char;
@@ -63,6 +62,26 @@ fn generate_date32_array(rng: &mut ThreadRng) -> Date32Array
{
Date32Array::from(data)
}
+fn generate_date64_array(rng: &mut ThreadRng) -> Date64Array {
+ let start_date = "1970-01-01"
+ .parse::<NaiveDate>()
+ .expect("Date should parse");
+ let end_date = "2050-12-31"
+ .parse::<NaiveDate>()
+ .expect("Date should parse");
+ let mut data: Vec<i64> = Vec::with_capacity(1000);
+ for _ in 0..1000 {
+ let date = pick_date_in_range(rng, start_date, end_date);
+ let millis = date
+ .and_hms_opt(0, 0, 0)
+ .unwrap()
+ .and_utc()
+ .timestamp_millis();
+ data.push(millis);
+ }
+ Date64Array::from(data)
+}
+
const DATE_PATTERNS: [&str; 5] =
["%Y:%m:%d", "%d-%m-%Y", "%d%m%Y", "%Y%m%d", "%Y...%m...%d"];
@@ -155,7 +174,7 @@ fn criterion_benchmark(c: &mut Criterion) {
c.bench_function("to_char_array_datetime_patterns_1000", |b| {
let mut rng = rand::rng();
- let data_arr = generate_date32_array(&mut rng);
+ let data_arr = generate_date64_array(&mut rng);
let batch_len = data_arr.len();
let data = ColumnarValue::Array(Arc::new(data_arr) as ArrayRef);
let patterns =
ColumnarValue::Array(Arc::new(generate_datetime_pattern_array(
@@ -182,7 +201,7 @@ fn criterion_benchmark(c: &mut Criterion) {
c.bench_function("to_char_array_mixed_patterns_1000", |b| {
let mut rng = rand::rng();
- let data_arr = generate_date32_array(&mut rng);
+ let data_arr = generate_date64_array(&mut rng);
let batch_len = data_arr.len();
let data = ColumnarValue::Array(Arc::new(data_arr) as ArrayRef);
let patterns =
ColumnarValue::Array(Arc::new(generate_mixed_pattern_array(
@@ -235,7 +254,7 @@ fn criterion_benchmark(c: &mut Criterion) {
c.bench_function("to_char_scalar_datetime_pattern_1000", |b| {
let mut rng = rand::rng();
- let data_arr = generate_date32_array(&mut rng);
+ let data_arr = generate_date64_array(&mut rng);
let batch_len = data_arr.len();
let data = ColumnarValue::Array(Arc::new(data_arr) as ArrayRef);
let patterns = ColumnarValue::Scalar(ScalarValue::Utf8(Some(
@@ -259,38 +278,6 @@ fn criterion_benchmark(c: &mut Criterion) {
)
})
});
-
- c.bench_function("to_char_scalar_1000", |b| {
- let mut rng = rand::rng();
- let timestamp = "2026-07-08T09:10:11"
- .parse::<NaiveDateTime>()
- .unwrap()
- .with_nanosecond(56789)
- .unwrap()
- .and_utc()
- .timestamp_nanos_opt()
- .unwrap();
- let data = ColumnarValue::Scalar(TimestampNanosecond(Some(timestamp),
None));
- let pattern =
-
ColumnarValue::Scalar(ScalarValue::Utf8(Some(pick_date_pattern(&mut rng))));
-
- b.iter(|| {
- black_box(
- to_char()
- .invoke_with_args(ScalarFunctionArgs {
- args: vec![data.clone(), pattern.clone()],
- arg_fields: vec![
- Field::new("a", data.data_type(), true).into(),
- Field::new("b", pattern.data_type(), true).into(),
- ],
- number_rows: 1,
- return_field: Field::new("f", DataType::Utf8,
true).into(),
- config_options: Arc::clone(&config_options),
- })
- .expect("to_char should work on valid values"),
- )
- })
- });
}
criterion_group!(benches, criterion_benchmark);
diff --git a/datafusion/functions/src/datetime/to_char.rs
b/datafusion/functions/src/datetime/to_char.rs
index 2c6f823545..4ceaac1cc8 100644
--- a/datafusion/functions/src/datetime/to_char.rs
+++ b/datafusion/functions/src/datetime/to_char.rs
@@ -18,15 +18,15 @@
use std::any::Any;
use std::sync::Arc;
+use arrow::array::builder::StringBuilder;
use arrow::array::cast::AsArray;
-use arrow::array::{Array, ArrayRef, StringArray, new_null_array};
+use arrow::array::{Array, ArrayRef};
use arrow::compute::cast;
use arrow::datatypes::DataType;
use arrow::datatypes::DataType::{
Date32, Date64, Duration, Time32, Time64, Timestamp, Utf8,
};
use arrow::datatypes::TimeUnit::{Microsecond, Millisecond, Nanosecond, Second};
-use arrow::error::ArrowError;
use arrow::util::display::{ArrayFormatter, DurationFormat, FormatOptions};
use datafusion_common::{Result, ScalarValue, exec_err,
utils::take_function_args};
use datafusion_expr::TypeSignature::Exact;
@@ -143,20 +143,17 @@ impl ScalarUDFImpl for ToCharFunc {
let [date_time, format] = take_function_args(self.name(), &args)?;
match format {
- ColumnarValue::Scalar(ScalarValue::Utf8(None))
- | ColumnarValue::Scalar(ScalarValue::Null) =>
to_char_scalar(date_time, None),
- // constant format
- ColumnarValue::Scalar(ScalarValue::Utf8(Some(format))) => {
- // invoke to_char_scalar with the known string, without
converting to array
- to_char_scalar(date_time, Some(format))
+ ColumnarValue::Scalar(ScalarValue::Null | ScalarValue::Utf8(None))
=> {
+ Ok(ColumnarValue::Scalar(ScalarValue::Utf8(None)))
}
- ColumnarValue::Array(_) => to_char_array(&args),
- _ => {
- exec_err!(
- "Format for `to_char` must be non-null Utf8, received {}",
- format.data_type()
- )
+ ColumnarValue::Scalar(ScalarValue::Utf8(Some(fmt))) => {
+ to_char_scalar(date_time, fmt)
}
+ ColumnarValue::Array(_) => to_char_array(&args),
+ _ => exec_err!(
+ "Format for `to_char` must be non-null Utf8, received {}",
+ format.data_type()
+ ),
}
}
@@ -171,11 +168,8 @@ impl ScalarUDFImpl for ToCharFunc {
fn build_format_options<'a>(
data_type: &DataType,
- format: Option<&'a str>,
-) -> Result<FormatOptions<'a>, Result<ColumnarValue>> {
- let Some(format) = format else {
- return Ok(FormatOptions::new());
- };
+ format: &'a str,
+) -> Result<FormatOptions<'a>> {
let format_options = match data_type {
Date32 => FormatOptions::new()
.with_date_format(Some(format))
@@ -194,144 +188,114 @@ fn build_format_options<'a>(
},
),
other => {
- return Err(exec_err!(
+ return exec_err!(
"to_char only supports date, time, timestamp and duration data
types, received {other:?}"
- ));
+ );
}
};
Ok(format_options)
}
-/// Special version when arg\[1] is a scalar
-fn to_char_scalar(
- expression: &ColumnarValue,
- format: Option<&str>,
-) -> Result<ColumnarValue> {
- // it's possible that the expression is a scalar however because
- // of the implementation in arrow-rs we need to convert it to an array
+/// Formats `expression` using a constant `format` string.
+fn to_char_scalar(expression: &ColumnarValue, format: &str) ->
Result<ColumnarValue> {
+ // ArrayFormatter requires an array, so scalar expressions must be
+ // converted to a 1-element array first.
let data_type = &expression.data_type();
let is_scalar_expression = matches!(&expression, ColumnarValue::Scalar(_));
- let array = expression.clone().into_array(1)?;
+ let array = expression.to_array(1)?;
- if format.is_none() {
- return if is_scalar_expression {
- Ok(ColumnarValue::Scalar(ScalarValue::Utf8(None)))
- } else {
- Ok(ColumnarValue::Array(new_null_array(&Utf8, array.len())))
- };
- }
+ let format_options = build_format_options(data_type, format)?;
+ let formatter = ArrayFormatter::try_new(array.as_ref(), &format_options)?;
- let format_options = match build_format_options(data_type, format) {
- Ok(value) => value,
- Err(value) => return value,
- };
+ // Pad the preallocated capacity a bit because format specifiers often
+ // expand the string (e.g., %Y -> "2026")
+ let fmt_len = format.len() + 10;
+ let mut builder = StringBuilder::with_capacity(array.len(), array.len() *
fmt_len);
- let formatter = ArrayFormatter::try_new(array.as_ref(), &format_options)?;
- let formatted: Result<Vec<Option<String>>, ArrowError> = (0..array.len())
- .map(|i| {
- if array.is_null(i) {
- Ok(None)
- } else {
- formatter.value(i).try_to_string().map(Some)
- }
- })
- .collect();
-
- if let Ok(formatted) = formatted {
- if is_scalar_expression {
- Ok(ColumnarValue::Scalar(ScalarValue::Utf8(
- formatted.first().unwrap().clone(),
- )))
+ for i in 0..array.len() {
+ if array.is_null(i) {
+ builder.append_null();
} else {
- Ok(ColumnarValue::Array(
- Arc::new(StringArray::from(formatted)) as ArrayRef
- ))
- }
- } else {
- // if the data type was a Date32, formatting could have failed because
the format string
- // contained datetime specifiers, so we'll retry by casting the date
array as a timestamp array
- if data_type == &Date32 {
- return to_char_scalar(&expression.cast_to(&Date64, None)?, format);
+ // Write directly into the builder's internal buffer, then
+ // commit the value with append_value("").
+ match formatter.value(i).write(&mut builder) {
+ Ok(()) => builder.append_value(""),
+ // Arrow's Date32 formatter only handles date specifiers
+ // (%Y, %m, %d, ...). Format strings with time specifiers
+ // (%H, %M, %S, ...) cause it to fail. When this happens,
+ // we retry by casting to Date64, whose datetime formatter
+ // handles both date and time specifiers (with zero for
+ // the time components).
+ Err(_) if data_type == &Date32 => {
+ return to_char_scalar(&expression.cast_to(&Date64, None)?,
format);
+ }
+ Err(e) => return Err(e.into()),
+ }
}
+ }
- exec_err!("{}", formatted.unwrap_err())
+ let result = builder.finish();
+ if is_scalar_expression {
+ let val = result.is_valid(0).then(|| result.value(0).to_string());
+ Ok(ColumnarValue::Scalar(ScalarValue::Utf8(val)))
+ } else {
+ Ok(ColumnarValue::Array(Arc::new(result) as ArrayRef))
}
}
fn to_char_array(args: &[ColumnarValue]) -> Result<ColumnarValue> {
let arrays = ColumnarValue::values_to_arrays(args)?;
- let mut results: Vec<Option<String>> = vec![];
+ let data_array = &arrays[0];
let format_array = arrays[1].as_string::<i32>();
- let data_type = arrays[0].data_type();
+ let data_type = data_array.data_type();
- for idx in 0..arrays[0].len() {
- let format = if format_array.is_null(idx) {
- None
- } else {
- Some(format_array.value(idx))
- };
- if format.is_none() {
- results.push(None);
+ // Arbitrary guess for the length of a typical formatted datetime string
+ let fmt_len = 30;
+ let mut builder =
+ StringBuilder::with_capacity(data_array.len(), data_array.len() *
fmt_len);
+ let mut buffer = String::with_capacity(fmt_len);
+
+ for idx in 0..data_array.len() {
+ if format_array.is_null(idx) || data_array.is_null(idx) {
+ builder.append_null();
continue;
}
- let format_options = match build_format_options(data_type, format) {
- Ok(value) => value,
- Err(value) => return value,
- };
- // this isn't ideal but this can't use ValueFormatter as it isn't
independent
- // from ArrayFormatter
- let formatter = ArrayFormatter::try_new(arrays[0].as_ref(),
&format_options)?;
- let result = formatter.value(idx).try_to_string();
- match result {
- Ok(value) => results.push(Some(value)),
- Err(e) => {
- // if the data type was a Date32, formatting could have failed
because the format string
- // contained datetime specifiers, so we'll treat this specific
date element as a timestamp
- if data_type == &Date32 {
- let failed_date_value = arrays[0].slice(idx, 1);
-
- match retry_date_as_timestamp(&failed_date_value,
&format_options) {
- Ok(value) => {
- results.push(Some(value));
- continue;
- }
- Err(e) => {
- return exec_err!("{}", e);
- }
- }
- }
- return exec_err!("{}", e);
+ let format = format_array.value(idx);
+ let format_options = build_format_options(data_type, format)?;
+ let formatter = ArrayFormatter::try_new(data_array.as_ref(),
&format_options)?;
+
+ buffer.clear();
+
+ // We'd prefer to write directly to the StringBuilder's internal
buffer,
+ // but the write might fail, and there's no easy way to ensure a
partial
+ // write is removed from the buffer. So instead we write to a temporary
+ // buffer and `append_value` on success.
+ match formatter.value(idx).write(&mut buffer) {
+ Ok(()) => builder.append_value(&buffer),
+ // Retry with Date64 (see comment in to_char_scalar).
+ Err(_) if data_type == &Date32 => {
+ buffer.clear();
+ let date64_value = cast(&data_array.slice(idx, 1), &Date64)?;
+ let retry_fmt =
+ ArrayFormatter::try_new(date64_value.as_ref(),
&format_options)?;
+ retry_fmt.value(0).write(&mut buffer)?;
+ builder.append_value(&buffer);
}
+ Err(e) => return Err(e.into()),
}
}
+ let result = builder.finish();
match args[0] {
- ColumnarValue::Array(_) =>
Ok(ColumnarValue::Array(Arc::new(StringArray::from(
- results,
- )) as ArrayRef)),
- ColumnarValue::Scalar(_) => match results.first().unwrap() {
- Some(value) => Ok(ColumnarValue::Scalar(ScalarValue::Utf8(Some(
- value.to_string(),
- )))),
- None => Ok(ColumnarValue::Scalar(ScalarValue::Utf8(None))),
- },
+ ColumnarValue::Scalar(_) => {
+ let val = result.is_valid(0).then(|| result.value(0).to_string());
+ Ok(ColumnarValue::Scalar(ScalarValue::Utf8(val)))
+ }
+ ColumnarValue::Array(_) => Ok(ColumnarValue::Array(Arc::new(result) as
ArrayRef)),
}
}
-fn retry_date_as_timestamp(
- array_ref: &ArrayRef,
- format_options: &FormatOptions,
-) -> Result<String> {
- let target_data_type = Date64;
-
- let date_value = cast(&array_ref, &target_data_type)?;
- let formatter = ArrayFormatter::try_new(date_value.as_ref(),
format_options)?;
- let result = formatter.value(0).try_to_string()?;
-
- Ok(result)
-}
-
#[cfg(test)]
mod tests {
use crate::datetime::to_char::ToCharFunc;
diff --git a/datafusion/sqllogictest/test_files/datetime/timestamps.slt
b/datafusion/sqllogictest/test_files/datetime/timestamps.slt
index 9526ccebfd..c3d36b247b 100644
--- a/datafusion/sqllogictest/test_files/datetime/timestamps.slt
+++ b/datafusion/sqllogictest/test_files/datetime/timestamps.slt
@@ -3676,10 +3676,10 @@ select to_char(arrow_cast(123456, 'Duration(Second)'),
null);
----
NULL
-query error DataFusion error: Execution error: Cast error: Format error
+query error DataFusion error: Arrow error: Cast error: Format error
SELECT to_char(timestamps, '%X%K') from formats;
-query error DataFusion error: Execution error: Cast error: Format error
+query error DataFusion error: Arrow error: Cast error: Format error
SELECT to_char('2000-02-03'::date, '%X%K');
query T
@@ -3726,6 +3726,21 @@ select to_char('2020-01-01 00:10:20.123'::timestamp at
time zone 'America/New_Yo
----
2020-01-01 00:10:20.123
+# Null values with array format
+query T
+SELECT to_char(column1, column2)
+FROM (VALUES
+ (DATE '2020-09-01', '%Y-%m-%d'),
+ (NULL, '%Y-%m-%d'),
+ (DATE '2020-09-02', NULL),
+ (NULL, NULL)
+);
+----
+2020-09-01
+NULL
+NULL
+NULL
+
statement ok
drop table formats;
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]