This is an automated email from the ASF dual-hosted git repository.

zanmato pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/main by this push:
     new 86166d5874 GH-43660: [C++][Compute] Avoid ZeroCopyCastExec when casting Binary offset -> Binary offset types (#48171)
86166d5874 is described below

commit 86166d5874555465e39d827c1a4dfa631a8c76c2
Author: scott-routledge2 <[email protected]>
AuthorDate: Wed Dec 17 02:57:12 2025 -0500

    GH-43660: [C++][Compute] Avoid ZeroCopyCastExec when casting Binary offset -> Binary offset types (#48171)
    
    ### Rationale for this change
    
    Casting Binary offset -> Binary offset types relies on ZeroCopyCastExec,
    which propagates the offset of the input to the output. This can lead to
    larger allocations than necessary when casting arrays with offsets.
    
    See https://github.com/apache/arrow/issues/43660 and
    https://github.com/apache/arrow/pull/43661 for more context.
    
    ### What changes are included in this PR?
    
    Ensure the output array has a small offset (it can still be non-zero, since
    reusing the null bitmap requires in_offset % 8 == out_offset % 8).
    
    ### Are these changes tested?
    
    Ran unit tests and benchmarked locally.
    
    ### Are there any user-facing changes?
    
    No
    
    * GitHub Issue: #43660
    
    Authored-by: Scott Routledge <[email protected]>
    Signed-off-by: Rossi Sun <[email protected]>
---
 .../arrow/compute/kernels/scalar_cast_string.cc    | 32 +++++++++++++++--
 cpp/src/arrow/compute/kernels/scalar_cast_test.cc  | 40 ++++++++++++++++++++++
 2 files changed, 69 insertions(+), 3 deletions(-)

diff --git a/cpp/src/arrow/compute/kernels/scalar_cast_string.cc 
b/cpp/src/arrow/compute/kernels/scalar_cast_string.cc
index 3442d46f16..4d0aa943ed 100644
--- a/cpp/src/arrow/compute/kernels/scalar_cast_string.cc
+++ b/cpp/src/arrow/compute/kernels/scalar_cast_string.cc
@@ -20,12 +20,14 @@
 
 #include "arrow/array/array_base.h"
 #include "arrow/array/builder_binary.h"
+#include "arrow/buffer.h"
 #include "arrow/compute/kernels/base_arithmetic_internal.h"
 #include "arrow/compute/kernels/codegen_internal.h"
 #include "arrow/compute/kernels/common_internal.h"
 #include "arrow/compute/kernels/scalar_cast_internal.h"
 #include "arrow/compute/kernels/temporal_internal.h"
 #include "arrow/result.h"
+#include "arrow/status.h"
 #include "arrow/type.h"
 #include "arrow/type_traits.h"
 #include "arrow/util/formatting.h"
@@ -304,10 +306,34 @@ BinaryToBinaryCastExec(KernelContext* ctx, const 
ExecSpan& batch, ExecResult* ou
     }
   }
 
-  // Start with a zero-copy cast, but change indices to expected size
+  // Start with a zero-copy cast, but change indices to the correct size and 
set validity
+  // bitmap and offset if needed.
   RETURN_NOT_OK(ZeroCopyCastExec(ctx, batch, out));
-  return CastBinaryToBinaryOffsets<typename I::offset_type, typename 
O::offset_type>(
-      ctx, input, out->array_data().get());
+  if constexpr (sizeof(typename I::offset_type) != sizeof(typename 
O::offset_type)) {
+    std::shared_ptr<ArrayData> input_arr = input.ToArrayData();
+    ArrayData* output = out->array_data().get();
+
+    // Slice buffers to minimize the output's offset. We need a small offset 
because
+    // CastBinaryToBinaryOffsets() will reallocate the offsets buffer with size
+    // (out_length + out_offset + 1) * sizeof(offset_type).
+    int64_t input_offset = input_arr->offset;
+    size_t input_offset_type_size = sizeof(typename I::offset_type);
+    if (output->null_count != 0 && output->buffers[0]) {
+      // Avoid reallocation of the validity buffer by allowing some padding 
bits
+      output->offset = input_offset % 8;
+    } else {
+      output->offset = 0;
+    }
+    if (output->buffers[0]) {
+      output->buffers[0] = SliceBuffer(output->buffers[0], input_offset / 8);
+    }
+    output->buffers[1] = SliceBuffer(
+        output->buffers[1], (input_offset - output->offset) * 
input_offset_type_size);
+
+    return CastBinaryToBinaryOffsets<typename I::offset_type, typename 
O::offset_type>(
+        ctx, input, out->array_data().get());
+  }
+  return Status::OK();
 }
 
 // String View -> Offset String
diff --git a/cpp/src/arrow/compute/kernels/scalar_cast_test.cc 
b/cpp/src/arrow/compute/kernels/scalar_cast_test.cc
index 44b50b31f7..2589756a07 100644
--- a/cpp/src/arrow/compute/kernels/scalar_cast_test.cc
+++ b/cpp/src/arrow/compute/kernels/scalar_cast_test.cc
@@ -3400,6 +3400,46 @@ TEST(Cast, StringToString) {
   }
 }
 
+TEST(Cast, StringToStringWithOffset) {
+  // GH-43660: Check casting String Arrays with nonzero offset
+  std::vector<int64_t> offsets = {3, 8, 10, 12};
+  std::vector<int64_t> lengths = {5, 2, 1, 0};
+
+  for (auto from_type : {utf8(), large_utf8()}) {
+    for (auto to_type : {utf8(), large_utf8()}) {
+      for (size_t i = 0; i < offsets.size(); ++i) {
+        auto offset = offsets[i];
+        auto length = lengths[i];
+
+        auto input_with_nulls = R"([
+          "foo", null, "bar", null, "quu", "foo", "baz", "bar",
+          null, "bar", "baz", null
+          ])";
+
+        auto input_arr_with_nulls = ArrayFromJSON(from_type, input_with_nulls);
+        auto output_arr_with_nulls = ArrayFromJSON(to_type, input_with_nulls);
+        CheckCast(input_arr_with_nulls->Slice(offset),
+                  output_arr_with_nulls->Slice(offset));
+        // Slice with length
+        CheckCast(input_arr_with_nulls->Slice(offset, length),
+                  output_arr_with_nulls->Slice(offset, length));
+
+        auto input_no_nulls = R"([
+            "foo", "aa", "bar", "bb", "quu", "foo", "baz", "bar",
+            "cc", "bar", "baz", "foo"
+            ])";
+
+        auto input_arr_no_nulls = ArrayFromJSON(from_type, input_no_nulls);
+        auto output_arr_no_nulls = ArrayFromJSON(to_type, input_no_nulls);
+        CheckCast(input_arr_no_nulls->Slice(offset), 
output_arr_no_nulls->Slice(offset));
+        // Slice with length
+        CheckCast(input_arr_no_nulls->Slice(offset, length),
+                  output_arr_no_nulls->Slice(offset, length));
+      }
+    }
+  }
+}
+
 TEST(Cast, BinaryOrStringToFixedSizeBinary) {
   for (auto in_type :
        {utf8(), large_utf8(), utf8_view(), binary(), binary_view(), 
large_binary()}) {

Reply via email to