This is an automated email from the ASF dual-hosted git repository.

raulcd pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/main by this push:
     new b3e261ddf7 GH-46771: [Python][C++] Implement pa.arange function to 
generate array sequences (#46778)
b3e261ddf7 is described below

commit b3e261ddf7efdce458f0f2dd903c89c78a956c51
Author: Raúl Cumplido <[email protected]>
AuthorDate: Fri Jun 20 12:33:27 2025 +0200

    GH-46771: [Python][C++] Implement pa.arange function to generate array 
sequences (#46778)
    
    ### Rationale for this change
    
    When slicing arrays with non-trivial steps we were using `numpy.arange` to 
generate the indices for `take`. As numpy is an optional dependency, replacing 
it with a pure-Python `list(range(...))` caused a performance penalty. Creating 
a pyarrow function to build our own ranges, mimicking Python's `range` or 
numpy's `arange`, is useful for that use case and might also be useful for 
other use cases. Currently we only generate `Array[Int64]`, but we could 
potentially generate more types.
    
    ### What changes are included in this PR?
    
    Provide a `pa.arange` function that allows us to generate indices when 
slicing arrays.
    
    ### Are these changes tested?
    
    Yes, new tests were added.
    
    ### Are there any user-facing changes?
    
    No, but a new `pyarrow.arange` function has been added.
    
    * GitHub Issue: #46771
    
    Authored-by: Raúl Cumplido <[email protected]>
    Signed-off-by: Raúl Cumplido <[email protected]>
---
 python/CMakeLists.txt                       |  3 +-
 python/pyarrow/__init__.py                  |  1 +
 python/pyarrow/array.pxi                    | 41 ++++++++++++++++++++---
 python/pyarrow/includes/libarrow_python.pxd |  3 ++
 python/pyarrow/src/arrow/python/api.h       |  1 +
 python/pyarrow/src/arrow/python/util.cc     | 50 +++++++++++++++++++++++++++++
 python/pyarrow/src/arrow/python/util.h      | 40 +++++++++++++++++++++++
 python/pyarrow/tests/test_array.py          | 26 +++++++++++++++
 8 files changed, 159 insertions(+), 6 deletions(-)

diff --git a/python/CMakeLists.txt b/python/CMakeLists.txt
index 4138d2b282..8b938fb641 100644
--- a/python/CMakeLists.txt
+++ b/python/CMakeLists.txt
@@ -368,7 +368,8 @@ set(PYARROW_CPP_SRCS
     ${PYARROW_CPP_SOURCE_DIR}/python_test.cc
     ${PYARROW_CPP_SOURCE_DIR}/python_to_arrow.cc
     ${PYARROW_CPP_SOURCE_DIR}/pyarrow.cc
-    ${PYARROW_CPP_SOURCE_DIR}/udf.cc)
+    ${PYARROW_CPP_SOURCE_DIR}/udf.cc
+    ${PYARROW_CPP_SOURCE_DIR}/util.cc)
 set_source_files_properties(${PYARROW_CPP_SOURCE_DIR}/numpy_init.cc
                             PROPERTIES SKIP_UNITY_BUILD_INCLUSION ON)
 
diff --git a/python/pyarrow/__init__.py b/python/pyarrow/__init__.py
index c732c13764..3555fa4f31 100644
--- a/python/pyarrow/__init__.py
+++ b/python/pyarrow/__init__.py
@@ -191,6 +191,7 @@ from pyarrow.lib import (null, bool_,
                          SparseCOOTensor, SparseCSRMatrix, SparseCSCMatrix,
                          SparseCSFTensor,
                          infer_type, from_numpy_dtype,
+                         arange,
                          NullArray,
                          NumericArray, IntegerArray, FloatingPointArray,
                          BooleanArray,
diff --git a/python/pyarrow/array.pxi b/python/pyarrow/array.pxi
index 99b9f3762d..874e242511 100644
--- a/python/pyarrow/array.pxi
+++ b/python/pyarrow/array.pxi
@@ -572,22 +572,53 @@ def infer_type(values, mask=None, from_pandas=False):
     return pyarrow_wrap_data_type(out)
 
 
+def arange(int64_t start, int64_t stop, int64_t step=1, *, memory_pool=None):
+    """
+    Create an array of evenly spaced values within a given interval.
+
+    This function is similar to Python's `range` function.
+    The resulting array will contain values starting from `start` up to but not
+    including `stop`, with a step size of `step`.
+
+    Parameters
+    ----------
+    start : int
+        The starting value for the sequence. The returned array will include 
this value.
+    stop : int
+        The stopping value for the sequence. The returned array will not 
include this value.
+    step : int, default 1
+        The spacing between values.
+    memory_pool : MemoryPool, optional
+        A memory pool to use for memory allocations.
+
+    Raises
+    ------
+    ArrowInvalid
+        If `step` is zero.
+
+    Returns
+    -------
+    arange : Array
+    """
+    cdef CMemoryPool* pool = maybe_unbox_memory_pool(memory_pool)
+    with nogil:
+        c_array = GetResultValue(Arange(start, stop, step, pool))
+    return pyarrow_wrap_array(c_array)
+
+
 def _normalize_slice(object arrow_obj, slice key):
     """
     Slices with step not equal to 1 (or None) will produce a copy
     rather than a zero-copy view
     """
     cdef:
-        Py_ssize_t start, stop, step
+        int64_t start, stop, step
         Py_ssize_t n = len(arrow_obj)
 
     start, stop, step = key.indices(n)
 
     if step != 1:
-        indices = list(range(start, stop, step))
-        if len(indices) == 0:
-            return arrow_obj.slice(0, 0)
-        return arrow_obj.take(indices)
+        return arrow_obj.take(arange(start, stop, step))
     else:
         length = max(stop - start, 0)
         return arrow_obj.slice(start, length)
diff --git a/python/pyarrow/includes/libarrow_python.pxd 
b/python/pyarrow/includes/libarrow_python.pxd
index e544aa0165..bf90c13926 100644
--- a/python/pyarrow/includes/libarrow_python.pxd
+++ b/python/pyarrow/includes/libarrow_python.pxd
@@ -73,6 +73,9 @@ cdef extern from "arrow/python/api.h" namespace "arrow::py" 
nogil:
         object obj, object mask, const PyConversionOptions& options,
         CMemoryPool* pool)
 
+    CResult[shared_ptr[CArray]] Arange(int64_t start, int64_t stop,
+                                       int64_t step, CMemoryPool* pool)
+
     CResult[shared_ptr[CDataType]] NumPyDtypeToArrow(object dtype)
 
     CStatus NdarrayToArrow(CMemoryPool* pool, object ao, object mo,
diff --git a/python/pyarrow/src/arrow/python/api.h 
b/python/pyarrow/src/arrow/python/api.h
index e66bf49dfe..2af0963a9c 100644
--- a/python/pyarrow/src/arrow/python/api.h
+++ b/python/pyarrow/src/arrow/python/api.h
@@ -26,3 +26,4 @@
 #include "arrow/python/numpy_convert.h"
 #include "arrow/python/numpy_to_arrow.h"
 #include "arrow/python/python_to_arrow.h"
+#include "arrow/python/util.h"
diff --git a/python/pyarrow/src/arrow/python/util.cc 
b/python/pyarrow/src/arrow/python/util.cc
new file mode 100644
index 0000000000..cffe1eb956
--- /dev/null
+++ b/python/pyarrow/src/arrow/python/util.cc
@@ -0,0 +1,50 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "arrow/python/util.h"
+
+#include "arrow/array.h"
+#include "arrow/python/common.h"
+
+namespace arrow ::py {
+
+Result<std::shared_ptr<Array>> Arange(int64_t start, int64_t stop, int64_t 
step,
+                                      MemoryPool* pool) {
+  int64_t size;
+  if (step == 0) {
+    return Status::Invalid("Step must not be zero");
+  }
+  if (step > 0 && stop > start) {
+    // Ceiling division for positive step
+    size = (stop - start + step - 1) / step;
+  } else if (step < 0 && stop < start) {
+    // Ceiling division for negative step
+    size = (start - stop - step - 1) / (-step);
+  } else {
+    return MakeEmptyArray(int64());
+  }
+  std::shared_ptr<Buffer> data_buffer;
+  ARROW_ASSIGN_OR_RAISE(data_buffer, AllocateBuffer(size * sizeof(int64_t), 
pool));
+  auto values = reinterpret_cast<int64_t*>(data_buffer->mutable_data());
+  for (int64_t i = 0; i < size; ++i) {
+    values[i] = start + i * step;
+  }
+  auto data = ArrayData::Make(int64(), size, {nullptr, data_buffer}, 0);
+  return MakeArray(data);
+}
+
+}  // namespace arrow::py
diff --git a/python/pyarrow/src/arrow/python/util.h 
b/python/pyarrow/src/arrow/python/util.h
new file mode 100644
index 0000000000..ff2ffcaea9
--- /dev/null
+++ b/python/pyarrow/src/arrow/python/util.h
@@ -0,0 +1,40 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include "arrow/python/common.h"
+#include "arrow/python/visibility.h"
+
+namespace arrow::py {
+
+/// \brief Create an array of evenly spaced values within a given interval.
+/// This function is similar to Python's `range` function.
+/// The resulting array will contain values starting from `start` up to but not
+/// including `stop`, with a step size of `step`. If `step` is zero, the 
function
+/// will return an error.
+/// The resulting array will have a data type of `int64`.
+/// \param[in] start initial value of the sequence.
+/// \param[in] stop final value of the sequence (exclusive).
+/// \param[in] step step size between consecutive values.
+/// \param[in] pool Memory pool for any memory allocations.
+/// \return Result Array
+ARROW_PYTHON_EXPORT
+Result<std::shared_ptr<Array>> Arange(int64_t start, int64_t stop, int64_t 
step,
+                                      MemoryPool* pool);
+
+}  // namespace arrow::py
diff --git a/python/pyarrow/tests/test_array.py 
b/python/pyarrow/tests/test_array.py
index 7dabb8396b..97425df0f9 100644
--- a/python/pyarrow/tests/test_array.py
+++ b/python/pyarrow/tests/test_array.py
@@ -536,6 +536,32 @@ def test_array_slice_negative_step():
         assert result.equals(expected)
 
 
+def test_arange():
+    cases = [
+        (5, 103),        # Default step
+        (-2, 128, 3),
+        (4, 103, 5),
+        (10, -7, -1),
+        (100, -20, -3),
+        (0, 0),         # Empty array
+        (2, 10, -1),    # Empty array
+        (10, 3, 1),     # Empty array
+    ]
+    for case in cases:
+        result = pa.arange(*case)
+        result.validate(full=True)
+        assert result.equals(pa.array(list(range(*case)), type=pa.int64()))
+
+    # Validate memory_pool keyword argument
+    result = pa.arange(-1, 101, memory_pool=pa.default_memory_pool())
+    result.validate(full=True)
+    assert result.equals(pa.array(list(range(-1, 101)), type=pa.int64()))
+
+    # Special case for invalid step (arange does not accept step of 0)
+    with pytest.raises(pa.ArrowInvalid):
+        pa.arange(0, 10, 0)
+
+
 def test_array_diff():
     # ARROW-6252
     arr1 = pa.array(['foo'], type=pa.utf8())

Reply via email to