This is an automated email from the ASF dual-hosted git repository.

rok pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/main by this push:
     new ad5f15576e GH-48625: [Python] Add temporal unit checking in NumPyDtypeUnifier (#48626)
ad5f15576e is described below

commit ad5f15576e08a095394ebd2a1812dac4180ffe4d
Author: Hyukjin Kwon <[email protected]>
AuthorDate: Thu Jan 8 04:32:12 2026 +0900

    GH-48625: [Python] Add temporal unit checking in NumPyDtypeUnifier (#48626)
    
    ### Rationale for this change
    
    This is to address a todo:
    
    
https://github.com/apache/arrow/blob/de6eb89dbdcf210802c3aad5d3f1a3d4c64c3582/python/pyarrow/src/arrow/python/inference.cc#L258
    
    When users mix `numpy.datetime64` values with different units (e.g., `datetime64[s]` and `datetime64[ms]`) in a single array, PyArrow previously produced a confusing error message.
    
    ### What changes are included in this PR?
    
    - Added datetime64 unit validation in `NumPyDtypeUnifier::Observe_DATETIME()`
    - Added `InvalidDatetimeUnitMix()` method
    - Updated `NumPyDtypeUnifier::Observe()` to check units for same-type comparisons
    - Updated existing test `test_array_from_different_numpy_datetime_units_raises`
    - Removed the TODO comment (now implemented)
    
    ### Are these changes tested?
    
    Manually tested, and unit tests were added.
    
    ### Are there any user-facing changes?
    
    Yes. It produces a better error message. For example,
    
    ```python
    import pyarrow as pa
    import numpy as np
    pa.array([np.datetime64('2020-01-01', 's'), np.datetime64('2020-01-02', 'ms')])
    ```
    
    Before:
    
    ```
    pyarrow.lib.ArrowNotImplementedError: Expected np.datetime64 but got: timestamp[ms]
    ```
    
    After:
    
    ```
    pyarrow.lib.ArrowInvalid: Cannot mix NumPy datetime64 units s and ms
    ```
    * GitHub Issue: #48625
    
    Authored-by: Hyukjin Kwon <[email protected]>
    Signed-off-by: Rok Mihevc <[email protected]>
---
 python/pyarrow/src/arrow/python/inference.cc | 65 +++++++++++++++++++++++++++-
 python/pyarrow/tests/test_array.py           | 30 +++++++++++--
 2 files changed, 91 insertions(+), 4 deletions(-)

diff --git a/python/pyarrow/src/arrow/python/inference.cc b/python/pyarrow/src/arrow/python/inference.cc
index 1aa7915ba1..e5714862e4 100644
--- a/python/pyarrow/src/arrow/python/inference.cc
+++ b/python/pyarrow/src/arrow/python/inference.cc
@@ -108,6 +108,17 @@ class NumPyDtypeUnifier {
                            GetNumPyTypeName(new_dtype));
   }
 
+  Status InvalidDatetimeUnitMix(PyArray_Descr* new_descr) {
+    auto new_meta = reinterpret_cast<PyArray_DatetimeDTypeMetaData*>(
+        PyDataType_C_METADATA(new_descr));
+    auto current_meta = reinterpret_cast<PyArray_DatetimeDTypeMetaData*>(
+        PyDataType_C_METADATA(current_dtype_));
+
+    return Status::Invalid("Cannot mix NumPy datetime64 units ",
+                           DatetimeUnitName(current_meta->meta.base), " and ",
+                           DatetimeUnitName(new_meta->meta.base));
+  }
+
   int Observe_BOOL(PyArray_Descr* descr, int dtype) { return INVALID; }
 
   int Observe_INT8(PyArray_Descr* descr, int dtype) {
@@ -255,7 +266,17 @@ class NumPyDtypeUnifier {
   }
 
   int Observe_DATETIME(PyArray_Descr* dtype_obj) {
-    // TODO: check that units are all the same
+    // Check that datetime units are consistent across all values
+    auto datetime_meta = reinterpret_cast<PyArray_DatetimeDTypeMetaData*>(
+        PyDataType_C_METADATA(dtype_obj));
+    auto current_meta = reinterpret_cast<PyArray_DatetimeDTypeMetaData*>(
+        PyDataType_C_METADATA(current_dtype_));
+
+    if (datetime_meta->meta.base != current_meta->meta.base) {
+      // Units don't match - this is invalid
+      return INVALID;
+    }
+
     return OK;
   }
 
@@ -267,6 +288,13 @@ class NumPyDtypeUnifier {
       current_type_num_ = dtype;
       return Status::OK();
     } else if (current_type_num_ == dtype) {
+      // Same type, but for datetime we still need to check units match
+      if (dtype == NPY_DATETIME) {
+        int action = Observe_DATETIME(descr);
+        if (action == INVALID) {
+          return InvalidDatetimeUnitMix(descr);
+        }
+      }
       return Status::OK();
     }
 
@@ -309,6 +337,41 @@ class NumPyDtypeUnifier {
   int current_type_num() const { return current_type_num_; }
 
  private:
+  static std::string DatetimeUnitName(NPY_DATETIMEUNIT unit) {
+    switch (unit) {
+      case NPY_FR_Y:
+        return "Y";
+      case NPY_FR_M:
+        return "M";
+      case NPY_FR_W:
+        return "W";
+      case NPY_FR_D:
+        return "D";
+      case NPY_FR_h:
+        return "h";
+      case NPY_FR_m:
+        return "m";
+      case NPY_FR_s:
+        return "s";
+      case NPY_FR_ms:
+        return "ms";
+      case NPY_FR_us:
+        return "us";
+      case NPY_FR_ns:
+        return "ns";
+      case NPY_FR_ps:
+        return "ps";
+      case NPY_FR_fs:
+        return "fs";
+      case NPY_FR_as:
+        return "as";
+      case NPY_FR_GENERIC:
+        return "generic";
+      default:
+        return "unknown (" + std::to_string(static_cast<int>(unit)) + ")";
+    }
+  }
+
   int current_type_num_;
   PyArray_Descr* current_dtype_;
 };
diff --git a/python/pyarrow/tests/test_array.py b/python/pyarrow/tests/test_array.py
index cefa2de161..d09d9f45c7 100644
--- a/python/pyarrow/tests/test_array.py
+++ b/python/pyarrow/tests/test_array.py
@@ -2541,7 +2541,31 @@ def test_array_from_different_numpy_datetime_units_raises():
     ms = np.array(data, dtype='datetime64[ms]')
     data = list(s[:2]) + list(ms[2:])
 
-    with pytest.raises(pa.ArrowNotImplementedError):
+    with pytest.raises(pa.ArrowInvalid,
+                       match="Cannot mix NumPy datetime64 units s and ms"):
+        pa.array(data)
+
+
+@pytest.mark.numpy
+@pytest.mark.parametrize('unit', [
+    'Y',  # year
+    'M',  # month
+    'W',  # week
+    'h',  # hour
+    'm',  # minute
+    'ps',  # picosecond
+    'fs',  # femtosecond
+    'as',  # attosecond
+])
+def test_array_from_unsupported_numpy_datetime_unit_names(unit):
+    s_data = [np.datetime64('2020-01-01', 's')]
+    unsupported_data = [np.datetime64('2020', unit)]
+
+    # Mix supported unit (s) with unsupported unit
+    data = s_data + unsupported_data
+
+    with pytest.raises(pa.ArrowInvalid,
+                       match=f"Cannot mix NumPy datetime64 units s and {unit}"):
         pa.array(data)
 
 
@@ -2566,8 +2590,8 @@ def test_array_from_timestamp_with_generic_unit():
     x = np.datetime64('2017-01-01 01:01:01.111111111')
     y = np.datetime64('2018-11-22 12:24:48.111111111')
 
-    with pytest.raises(pa.ArrowNotImplementedError,
-                       match='Unbound or generic datetime64 time unit'):
+    with pytest.raises(pa.ArrowInvalid,
+                       match='Cannot mix NumPy datetime64 units'):
         pa.array([n, x, y])
 
 

Reply via email to