This is an automated email from the ASF dual-hosted git repository.

alamb pushed a commit to branch branch-53
in repository https://gitbox.apache.org/repos/asf/datafusion.git


The following commit(s) were added to refs/heads/branch-53 by this push:
     new 519866cd48 [branch-53] perf: Optimize `to_char` to allocate less, fix 
NULL handling (#20635) (#20885)
519866cd48 is described below

commit 519866cd48042a5d92c28eade376086a8cbedd2e
Author: Andrew Lamb <[email protected]>
AuthorDate: Thu Mar 12 07:07:01 2026 -0400

    [branch-53] perf: Optimize `to_char` to allocate less, fix NULL handling 
(#20635) (#20885)
    
    - Part of https://github.com/apache/datafusion/issues/19692
    - Closes https://github.com/apache/datafusion/issues/20634 on branch-53
    
    This PR:
    - Backports https://github.com/apache/datafusion/pull/20635 from
    @neilconway to the branch-53 line
    
    Co-authored-by: Neil Conway <[email protected]>
---
 datafusion/functions/benches/to_char.rs            |  61 +++---
 datafusion/functions/src/datetime/to_char.rs       | 214 +++++++++------------
 .../test_files/datetime/timestamps.slt             |  19 +-
 3 files changed, 130 insertions(+), 164 deletions(-)

diff --git a/datafusion/functions/benches/to_char.rs 
b/datafusion/functions/benches/to_char.rs
index 65f4999d23..4d866570b7 100644
--- a/datafusion/functions/benches/to_char.rs
+++ b/datafusion/functions/benches/to_char.rs
@@ -18,13 +18,12 @@
 use std::hint::black_box;
 use std::sync::Arc;
 
-use arrow::array::{ArrayRef, Date32Array, StringArray};
+use arrow::array::{ArrayRef, Date32Array, Date64Array, StringArray};
 use arrow::datatypes::{DataType, Field};
 use chrono::TimeDelta;
 use chrono::prelude::*;
 use criterion::{Criterion, criterion_group, criterion_main};
 use datafusion_common::ScalarValue;
-use datafusion_common::ScalarValue::TimestampNanosecond;
 use datafusion_common::config::ConfigOptions;
 use datafusion_expr::{ColumnarValue, ScalarFunctionArgs};
 use datafusion_functions::datetime::to_char;
@@ -63,6 +62,26 @@ fn generate_date32_array(rng: &mut ThreadRng) -> Date32Array 
{
     Date32Array::from(data)
 }
 
+fn generate_date64_array(rng: &mut ThreadRng) -> Date64Array {
+    let start_date = "1970-01-01"
+        .parse::<NaiveDate>()
+        .expect("Date should parse");
+    let end_date = "2050-12-31"
+        .parse::<NaiveDate>()
+        .expect("Date should parse");
+    let mut data: Vec<i64> = Vec::with_capacity(1000);
+    for _ in 0..1000 {
+        let date = pick_date_in_range(rng, start_date, end_date);
+        let millis = date
+            .and_hms_opt(0, 0, 0)
+            .unwrap()
+            .and_utc()
+            .timestamp_millis();
+        data.push(millis);
+    }
+    Date64Array::from(data)
+}
+
 const DATE_PATTERNS: [&str; 5] =
     ["%Y:%m:%d", "%d-%m-%Y", "%d%m%Y", "%Y%m%d", "%Y...%m...%d"];
 
@@ -155,7 +174,7 @@ fn criterion_benchmark(c: &mut Criterion) {
 
     c.bench_function("to_char_array_datetime_patterns_1000", |b| {
         let mut rng = rand::rng();
-        let data_arr = generate_date32_array(&mut rng);
+        let data_arr = generate_date64_array(&mut rng);
         let batch_len = data_arr.len();
         let data = ColumnarValue::Array(Arc::new(data_arr) as ArrayRef);
         let patterns = 
ColumnarValue::Array(Arc::new(generate_datetime_pattern_array(
@@ -182,7 +201,7 @@ fn criterion_benchmark(c: &mut Criterion) {
 
     c.bench_function("to_char_array_mixed_patterns_1000", |b| {
         let mut rng = rand::rng();
-        let data_arr = generate_date32_array(&mut rng);
+        let data_arr = generate_date64_array(&mut rng);
         let batch_len = data_arr.len();
         let data = ColumnarValue::Array(Arc::new(data_arr) as ArrayRef);
         let patterns = 
ColumnarValue::Array(Arc::new(generate_mixed_pattern_array(
@@ -235,7 +254,7 @@ fn criterion_benchmark(c: &mut Criterion) {
 
     c.bench_function("to_char_scalar_datetime_pattern_1000", |b| {
         let mut rng = rand::rng();
-        let data_arr = generate_date32_array(&mut rng);
+        let data_arr = generate_date64_array(&mut rng);
         let batch_len = data_arr.len();
         let data = ColumnarValue::Array(Arc::new(data_arr) as ArrayRef);
         let patterns = ColumnarValue::Scalar(ScalarValue::Utf8(Some(
@@ -259,38 +278,6 @@ fn criterion_benchmark(c: &mut Criterion) {
             )
         })
     });
-
-    c.bench_function("to_char_scalar_1000", |b| {
-        let mut rng = rand::rng();
-        let timestamp = "2026-07-08T09:10:11"
-            .parse::<NaiveDateTime>()
-            .unwrap()
-            .with_nanosecond(56789)
-            .unwrap()
-            .and_utc()
-            .timestamp_nanos_opt()
-            .unwrap();
-        let data = ColumnarValue::Scalar(TimestampNanosecond(Some(timestamp), 
None));
-        let pattern =
-            
ColumnarValue::Scalar(ScalarValue::Utf8(Some(pick_date_pattern(&mut rng))));
-
-        b.iter(|| {
-            black_box(
-                to_char()
-                    .invoke_with_args(ScalarFunctionArgs {
-                        args: vec![data.clone(), pattern.clone()],
-                        arg_fields: vec![
-                            Field::new("a", data.data_type(), true).into(),
-                            Field::new("b", pattern.data_type(), true).into(),
-                        ],
-                        number_rows: 1,
-                        return_field: Field::new("f", DataType::Utf8, 
true).into(),
-                        config_options: Arc::clone(&config_options),
-                    })
-                    .expect("to_char should work on valid values"),
-            )
-        })
-    });
 }
 
 criterion_group!(benches, criterion_benchmark);
diff --git a/datafusion/functions/src/datetime/to_char.rs 
b/datafusion/functions/src/datetime/to_char.rs
index 2c6f823545..4ceaac1cc8 100644
--- a/datafusion/functions/src/datetime/to_char.rs
+++ b/datafusion/functions/src/datetime/to_char.rs
@@ -18,15 +18,15 @@
 use std::any::Any;
 use std::sync::Arc;
 
+use arrow::array::builder::StringBuilder;
 use arrow::array::cast::AsArray;
-use arrow::array::{Array, ArrayRef, StringArray, new_null_array};
+use arrow::array::{Array, ArrayRef};
 use arrow::compute::cast;
 use arrow::datatypes::DataType;
 use arrow::datatypes::DataType::{
     Date32, Date64, Duration, Time32, Time64, Timestamp, Utf8,
 };
 use arrow::datatypes::TimeUnit::{Microsecond, Millisecond, Nanosecond, Second};
-use arrow::error::ArrowError;
 use arrow::util::display::{ArrayFormatter, DurationFormat, FormatOptions};
 use datafusion_common::{Result, ScalarValue, exec_err, 
utils::take_function_args};
 use datafusion_expr::TypeSignature::Exact;
@@ -143,20 +143,17 @@ impl ScalarUDFImpl for ToCharFunc {
         let [date_time, format] = take_function_args(self.name(), &args)?;
 
         match format {
-            ColumnarValue::Scalar(ScalarValue::Utf8(None))
-            | ColumnarValue::Scalar(ScalarValue::Null) => 
to_char_scalar(date_time, None),
-            // constant format
-            ColumnarValue::Scalar(ScalarValue::Utf8(Some(format))) => {
-                // invoke to_char_scalar with the known string, without 
converting to array
-                to_char_scalar(date_time, Some(format))
+            ColumnarValue::Scalar(ScalarValue::Null | ScalarValue::Utf8(None)) 
=> {
+                Ok(ColumnarValue::Scalar(ScalarValue::Utf8(None)))
             }
-            ColumnarValue::Array(_) => to_char_array(&args),
-            _ => {
-                exec_err!(
-                    "Format for `to_char` must be non-null Utf8, received {}",
-                    format.data_type()
-                )
+            ColumnarValue::Scalar(ScalarValue::Utf8(Some(fmt))) => {
+                to_char_scalar(date_time, fmt)
             }
+            ColumnarValue::Array(_) => to_char_array(&args),
+            _ => exec_err!(
+                "Format for `to_char` must be non-null Utf8, received {}",
+                format.data_type()
+            ),
         }
     }
 
@@ -171,11 +168,8 @@ impl ScalarUDFImpl for ToCharFunc {
 
 fn build_format_options<'a>(
     data_type: &DataType,
-    format: Option<&'a str>,
-) -> Result<FormatOptions<'a>, Result<ColumnarValue>> {
-    let Some(format) = format else {
-        return Ok(FormatOptions::new());
-    };
+    format: &'a str,
+) -> Result<FormatOptions<'a>> {
     let format_options = match data_type {
         Date32 => FormatOptions::new()
             .with_date_format(Some(format))
@@ -194,144 +188,114 @@ fn build_format_options<'a>(
             },
         ),
         other => {
-            return Err(exec_err!(
+            return exec_err!(
                 "to_char only supports date, time, timestamp and duration data 
types, received {other:?}"
-            ));
+            );
         }
     };
     Ok(format_options)
 }
 
-/// Special version when arg\[1] is a scalar
-fn to_char_scalar(
-    expression: &ColumnarValue,
-    format: Option<&str>,
-) -> Result<ColumnarValue> {
-    // it's possible that the expression is a scalar however because
-    // of the implementation in arrow-rs we need to convert it to an array
+/// Formats `expression` using a constant `format` string.
+fn to_char_scalar(expression: &ColumnarValue, format: &str) -> 
Result<ColumnarValue> {
+    // ArrayFormatter requires an array, so scalar expressions must be
+    // converted to a 1-element array first.
     let data_type = &expression.data_type();
     let is_scalar_expression = matches!(&expression, ColumnarValue::Scalar(_));
-    let array = expression.clone().into_array(1)?;
+    let array = expression.to_array(1)?;
 
-    if format.is_none() {
-        return if is_scalar_expression {
-            Ok(ColumnarValue::Scalar(ScalarValue::Utf8(None)))
-        } else {
-            Ok(ColumnarValue::Array(new_null_array(&Utf8, array.len())))
-        };
-    }
+    let format_options = build_format_options(data_type, format)?;
+    let formatter = ArrayFormatter::try_new(array.as_ref(), &format_options)?;
 
-    let format_options = match build_format_options(data_type, format) {
-        Ok(value) => value,
-        Err(value) => return value,
-    };
+    // Pad the preallocated capacity a bit because format specifiers often
+    // expand the string (e.g., %Y -> "2026")
+    let fmt_len = format.len() + 10;
+    let mut builder = StringBuilder::with_capacity(array.len(), array.len() * 
fmt_len);
 
-    let formatter = ArrayFormatter::try_new(array.as_ref(), &format_options)?;
-    let formatted: Result<Vec<Option<String>>, ArrowError> = (0..array.len())
-        .map(|i| {
-            if array.is_null(i) {
-                Ok(None)
-            } else {
-                formatter.value(i).try_to_string().map(Some)
-            }
-        })
-        .collect();
-
-    if let Ok(formatted) = formatted {
-        if is_scalar_expression {
-            Ok(ColumnarValue::Scalar(ScalarValue::Utf8(
-                formatted.first().unwrap().clone(),
-            )))
+    for i in 0..array.len() {
+        if array.is_null(i) {
+            builder.append_null();
         } else {
-            Ok(ColumnarValue::Array(
-                Arc::new(StringArray::from(formatted)) as ArrayRef
-            ))
-        }
-    } else {
-        // if the data type was a Date32, formatting could have failed because 
the format string
-        // contained datetime specifiers, so we'll retry by casting the date 
array as a timestamp array
-        if data_type == &Date32 {
-            return to_char_scalar(&expression.cast_to(&Date64, None)?, format);
+            // Write directly into the builder's internal buffer, then
+            // commit the value with append_value("").
+            match formatter.value(i).write(&mut builder) {
+                Ok(()) => builder.append_value(""),
+                // Arrow's Date32 formatter only handles date specifiers
+                // (%Y, %m, %d, ...). Format strings with time specifiers
+                // (%H, %M, %S, ...) cause it to fail. When this happens,
+                // we retry by casting to Date64, whose datetime formatter
+                // handles both date and time specifiers (with zero for
+                // the time components).
+                Err(_) if data_type == &Date32 => {
+                    return to_char_scalar(&expression.cast_to(&Date64, None)?, 
format);
+                }
+                Err(e) => return Err(e.into()),
+            }
         }
+    }
 
-        exec_err!("{}", formatted.unwrap_err())
+    let result = builder.finish();
+    if is_scalar_expression {
+        let val = result.is_valid(0).then(|| result.value(0).to_string());
+        Ok(ColumnarValue::Scalar(ScalarValue::Utf8(val)))
+    } else {
+        Ok(ColumnarValue::Array(Arc::new(result) as ArrayRef))
     }
 }
 
 fn to_char_array(args: &[ColumnarValue]) -> Result<ColumnarValue> {
     let arrays = ColumnarValue::values_to_arrays(args)?;
-    let mut results: Vec<Option<String>> = vec![];
+    let data_array = &arrays[0];
     let format_array = arrays[1].as_string::<i32>();
-    let data_type = arrays[0].data_type();
+    let data_type = data_array.data_type();
 
-    for idx in 0..arrays[0].len() {
-        let format = if format_array.is_null(idx) {
-            None
-        } else {
-            Some(format_array.value(idx))
-        };
-        if format.is_none() {
-            results.push(None);
+    // Arbitrary guess for the length of a typical formatted datetime string
+    let fmt_len = 30;
+    let mut builder =
+        StringBuilder::with_capacity(data_array.len(), data_array.len() * 
fmt_len);
+    let mut buffer = String::with_capacity(fmt_len);
+
+    for idx in 0..data_array.len() {
+        if format_array.is_null(idx) || data_array.is_null(idx) {
+            builder.append_null();
             continue;
         }
-        let format_options = match build_format_options(data_type, format) {
-            Ok(value) => value,
-            Err(value) => return value,
-        };
-        // this isn't ideal but this can't use ValueFormatter as it isn't 
independent
-        // from ArrayFormatter
-        let formatter = ArrayFormatter::try_new(arrays[0].as_ref(), 
&format_options)?;
-        let result = formatter.value(idx).try_to_string();
-        match result {
-            Ok(value) => results.push(Some(value)),
-            Err(e) => {
-                // if the data type was a Date32, formatting could have failed 
because the format string
-                // contained datetime specifiers, so we'll treat this specific 
date element as a timestamp
-                if data_type == &Date32 {
-                    let failed_date_value = arrays[0].slice(idx, 1);
-
-                    match retry_date_as_timestamp(&failed_date_value, 
&format_options) {
-                        Ok(value) => {
-                            results.push(Some(value));
-                            continue;
-                        }
-                        Err(e) => {
-                            return exec_err!("{}", e);
-                        }
-                    }
-                }
 
-                return exec_err!("{}", e);
+        let format = format_array.value(idx);
+        let format_options = build_format_options(data_type, format)?;
+        let formatter = ArrayFormatter::try_new(data_array.as_ref(), 
&format_options)?;
+
+        buffer.clear();
+
+        // We'd prefer to write directly to the StringBuilder's internal 
buffer,
+        // but the write might fail, and there's no easy way to ensure a 
partial
+        // write is removed from the buffer. So instead we write to a temporary
+        // buffer and `append_value` on success.
+        match formatter.value(idx).write(&mut buffer) {
+            Ok(()) => builder.append_value(&buffer),
+            // Retry with Date64 (see comment in to_char_scalar).
+            Err(_) if data_type == &Date32 => {
+                buffer.clear();
+                let date64_value = cast(&data_array.slice(idx, 1), &Date64)?;
+                let retry_fmt =
+                    ArrayFormatter::try_new(date64_value.as_ref(), 
&format_options)?;
+                retry_fmt.value(0).write(&mut buffer)?;
+                builder.append_value(&buffer);
             }
+            Err(e) => return Err(e.into()),
         }
     }
 
+    let result = builder.finish();
     match args[0] {
-        ColumnarValue::Array(_) => 
Ok(ColumnarValue::Array(Arc::new(StringArray::from(
-            results,
-        )) as ArrayRef)),
-        ColumnarValue::Scalar(_) => match results.first().unwrap() {
-            Some(value) => Ok(ColumnarValue::Scalar(ScalarValue::Utf8(Some(
-                value.to_string(),
-            )))),
-            None => Ok(ColumnarValue::Scalar(ScalarValue::Utf8(None))),
-        },
+        ColumnarValue::Scalar(_) => {
+            let val = result.is_valid(0).then(|| result.value(0).to_string());
+            Ok(ColumnarValue::Scalar(ScalarValue::Utf8(val)))
+        }
+        ColumnarValue::Array(_) => Ok(ColumnarValue::Array(Arc::new(result) as 
ArrayRef)),
     }
 }
 
-fn retry_date_as_timestamp(
-    array_ref: &ArrayRef,
-    format_options: &FormatOptions,
-) -> Result<String> {
-    let target_data_type = Date64;
-
-    let date_value = cast(&array_ref, &target_data_type)?;
-    let formatter = ArrayFormatter::try_new(date_value.as_ref(), 
format_options)?;
-    let result = formatter.value(0).try_to_string()?;
-
-    Ok(result)
-}
-
 #[cfg(test)]
 mod tests {
     use crate::datetime::to_char::ToCharFunc;
diff --git a/datafusion/sqllogictest/test_files/datetime/timestamps.slt 
b/datafusion/sqllogictest/test_files/datetime/timestamps.slt
index 9526ccebfd..c3d36b247b 100644
--- a/datafusion/sqllogictest/test_files/datetime/timestamps.slt
+++ b/datafusion/sqllogictest/test_files/datetime/timestamps.slt
@@ -3676,10 +3676,10 @@ select to_char(arrow_cast(123456, 'Duration(Second)'), 
null);
 ----
 NULL
 
-query error DataFusion error: Execution error: Cast error: Format error
+query error DataFusion error: Arrow error: Cast error: Format error
 SELECT to_char(timestamps, '%X%K') from formats;
 
-query error DataFusion error: Execution error: Cast error: Format error
+query error DataFusion error: Arrow error: Cast error: Format error
 SELECT to_char('2000-02-03'::date, '%X%K');
 
 query T
@@ -3726,6 +3726,21 @@ select to_char('2020-01-01 00:10:20.123'::timestamp at 
time zone 'America/New_Yo
 ----
 2020-01-01 00:10:20.123
 
+# Null values with array format
+query T
+SELECT to_char(column1, column2)
+FROM (VALUES
+    (DATE '2020-09-01', '%Y-%m-%d'),
+    (NULL, '%Y-%m-%d'),
+    (DATE '2020-09-02', NULL),
+    (NULL, NULL)
+);
+----
+2020-09-01
+NULL
+NULL
+NULL
+
 statement ok
 drop table formats;
 


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to