Jefffrey commented on code in PR #22315:
URL: https://github.com/apache/datafusion/pull/22315#discussion_r3303749711


##########
datafusion/functions/src/datetime/date_bin.rs:
##########
@@ -575,53 +578,85 @@ fn date_bin_impl(
         return exec_err!("DATE_BIN stride must be non-zero");
     }
 
-    fn stride_map_fn<T: ArrowTimestampType>(
-        origin: i64,
-        stride: i64,
-        stride_fn: BinFunction,
-    ) -> impl Fn(i64) -> Result<i64> {
-        let scale = match T::UNIT {
+    fn timestamp_scale<T: ArrowTimestampType>() -> i64 {
+        match T::UNIT {
             Nanosecond => 1,
             Microsecond => NANOS_PER_MICRO,
             Millisecond => NANOS_PER_MILLI,
             Second => NANOSECONDS,
-        };
-        move |x: i64| match stride_fn(stride, x * scale, origin) {
-            Ok(result) => Ok(result / scale),
-            Err(e) => Err(e),
         }
     }
 
+    fn scale_timestamp_to_nanos(x: i64, scale: i64) -> Result<i64> {

Review Comment:
   Maybe its better to inline this (and just have the error message as the 
abstraction) since forcing a DataFusion error here leads to some quirks like 
needing to convert to arrow (before converting back to datafusion error)



##########
datafusion/functions/src/datetime/date_bin.rs:
##########
@@ -575,53 +578,85 @@ fn date_bin_impl(
         return exec_err!("DATE_BIN stride must be non-zero");
     }
 
-    fn stride_map_fn<T: ArrowTimestampType>(
-        origin: i64,
-        stride: i64,
-        stride_fn: BinFunction,
-    ) -> impl Fn(i64) -> Result<i64> {
-        let scale = match T::UNIT {
+    fn timestamp_scale<T: ArrowTimestampType>() -> i64 {
+        match T::UNIT {
             Nanosecond => 1,
             Microsecond => NANOS_PER_MICRO,
             Millisecond => NANOS_PER_MILLI,
             Second => NANOSECONDS,
-        };
-        move |x: i64| match stride_fn(stride, x * scale, origin) {
-            Ok(result) => Ok(result / scale),
-            Err(e) => Err(e),
         }
     }
 
+    fn scale_timestamp_to_nanos(x: i64, scale: i64) -> Result<i64> {
+        x.checked_mul(scale).ok_or_else(|| {
+            DataFusionError::Execution(format!(
+                "DATE_BIN source timestamp {x} cannot be represented in 
nanoseconds"
+            ))
+        })
+    }
+
     Ok(match array {
         ColumnarValue::Scalar(ScalarValue::TimestampNanosecond(v, tz_opt)) => {
-            let apply_stride_fn =
-                stride_map_fn::<TimestampNanosecondType>(origin, stride, 
stride_fn);
+            let scale = timestamp_scale::<TimestampNanosecondType>();
             ColumnarValue::Scalar(ScalarValue::TimestampNanosecond(
-                v.and_then(|val| apply_stride_fn(val).ok()),
+                match *v {
+                    Some(val) => {
+                        let scaled = scale_timestamp_to_nanos(val, scale)?;
+                        match stride_fn(stride, scaled, origin) {
+                            Ok(result) => Some(result / scale),
+                            Err(_) => None,
+                        }

Review Comment:
   I find it a little confusing we surface error for the overflow, but other 
potential errors are masked via null values 🤔 
   
   I raised an issue since I see we are already a bit inconsistent with this:
   
   - https://github.com/apache/datafusion/issues/22528



##########
datafusion/functions/src/datetime/date_bin.rs:
##########
@@ -1066,6 +1101,54 @@ mod tests {
             res.err().unwrap().strip_backtrace(),
             "This feature is not implemented: DATE_BIN only supports literal 
values for the origin argument, not arrays"
         );
+
+        // source: overflow while scaling to nanoseconds
+        args = vec![
+            ColumnarValue::Scalar(ScalarValue::IntervalMonthDayNano(Some(
+                IntervalMonthDayNano {
+                    months: 0,
+                    days: 0,
+                    nanoseconds: 1,
+                },
+            ))),
+            ColumnarValue::Scalar(ScalarValue::TimestampSecond(Some(i64::MAX), 
None)),
+            ColumnarValue::Scalar(ScalarValue::TimestampNanosecond(Some(0), 
None)),
+        ];
+        let res = invoke_date_bin_with_args(args, 1, return_field);
+        assert_eq!(
+            res.err().unwrap().strip_backtrace(),
+            "Execution error: DATE_BIN source timestamp 9223372036854775807 
cannot be represented in nanoseconds"
+        );
+
+        // source: overflow while scaling to nanoseconds (array path)
+        let input = Arc::new(
+            vec![Some(i64::MAX)]
+                .into_iter()
+                .collect::<TimestampSecondArray>(),
+        );
+        let return_field_second = &Arc::new(Field::new(
+            "f",
+            DataType::Timestamp(TimeUnit::Second, None),
+            true,
+        ));
+        args = vec![
+            ColumnarValue::Scalar(ScalarValue::IntervalMonthDayNano(Some(
+                IntervalMonthDayNano {
+                    months: 0,

Review Comment:
   Can we have these as slts only? Or is it easier to represent via rust? It 
seems we overlap with the introduced slt case anyway



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to