This is an automated email from the ASF dual-hosted git repository.
kou pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/main by this push:
new 6d655698f3 GH-48360: [Ruby] Add support for reading large binary array
(#48361)
6d655698f3 is described below
commit 6d655698f32a0f16db3ea3c3878158062dc2a10c
Author: Sutou Kouhei <[email protected]>
AuthorDate: Sun Dec 7 14:37:44 2025 +0900
GH-48360: [Ruby] Add support for reading large binary array (#48361)
### Rationale for this change
Large binary array is the 64-bit offset version of binary array.
### What changes are included in this PR?
* Add `ArrowFormat::LargeBinaryType`
* Add `ArrowFormat::LargeBinaryArray`
### Are these changes tested?
Yes.
### Are there any user-facing changes?
Yes. `ArrowFormat::LargeBinaryType` and `ArrowFormat::LargeBinaryArray` are added.
* GitHub Issue: #48360
Authored-by: Sutou Kouhei <[email protected]>
Signed-off-by: Sutou Kouhei <[email protected]>
---
ruby/red-arrow-format/lib/arrow-format/array.rb | 21 ++++++++++++++++-
.../lib/arrow-format/file-reader.rb | 6 +++--
ruby/red-arrow-format/lib/arrow-format/type.rb | 27 ++++++++++++++++++++--
ruby/red-arrow-format/test/test-file-reader.rb | 11 +++++++++
4 files changed, 60 insertions(+), 5 deletions(-)
diff --git a/ruby/red-arrow-format/lib/arrow-format/array.rb
b/ruby/red-arrow-format/lib/arrow-format/array.rb
index f3c3c49233..d4995cda3e 100644
--- a/ruby/red-arrow-format/lib/arrow-format/array.rb
+++ b/ruby/red-arrow-format/lib/arrow-format/array.rb
@@ -113,7 +113,7 @@ module ArrowFormat
def to_a
values = @offsets_buffer.
- each(:s32, 0, @size + 1). # TODO: big endian support
+ each(buffer_type, 0, @size + 1).
each_cons(2).
collect do |(_, offset), (_, next_offset)|
length = next_offset - offset
@@ -125,6 +125,21 @@ module ArrowFormat
class BinaryArray < VariableSizeBinaryLayoutArray
private
+ def buffer_type
+ :s32 # TODO: big endian support
+ end
+
+ def encoding
+ Encoding::ASCII_8BIT
+ end
+ end
+
+ class LargeBinaryArray < VariableSizeBinaryLayoutArray
+ private
+ def buffer_type
+ :s64 # TODO: big endian support
+ end
+
def encoding
Encoding::ASCII_8BIT
end
@@ -132,6 +147,10 @@ module ArrowFormat
class UTF8Array < VariableSizeBinaryLayoutArray
private
+ def buffer_type
+ :s32 # TODO: big endian support
+ end
+
def encoding
Encoding::UTF_8
end
diff --git a/ruby/red-arrow-format/lib/arrow-format/file-reader.rb
b/ruby/red-arrow-format/lib/arrow-format/file-reader.rb
index edc866b3f0..acd21b9764 100644
--- a/ruby/red-arrow-format/lib/arrow-format/file-reader.rb
+++ b/ruby/red-arrow-format/lib/arrow-format/file-reader.rb
@@ -27,6 +27,7 @@ require_relative "org/apache/arrow/flatbuf/bool"
require_relative "org/apache/arrow/flatbuf/floating_point"
require_relative "org/apache/arrow/flatbuf/footer"
require_relative "org/apache/arrow/flatbuf/int"
+require_relative "org/apache/arrow/flatbuf/large_binary"
require_relative "org/apache/arrow/flatbuf/list"
require_relative "org/apache/arrow/flatbuf/message"
require_relative "org/apache/arrow/flatbuf/null"
@@ -158,6 +159,8 @@ module ArrowFormat
type = ListType.new(read_field(fb_field.children[0]))
when Org::Apache::Arrow::Flatbuf::Binary
type = BinaryType.singleton
+ when Org::Apache::Arrow::Flatbuf::LargeBinary
+ type = LargeBinaryType.singleton
when Org::Apache::Arrow::Flatbuf::Utf8
type = UTF8Type.singleton
end
@@ -196,8 +199,7 @@ module ArrowFormat
offsets = body.slice(offsets_buffer.offset, offsets_buffer.length)
child = read_column(field.type.child, nodes, buffers, body)
field.type.build_array(length, validity, offsets, child)
- when BinaryType,
- UTF8Type
+ when VariableSizeBinaryType
offsets_buffer = buffers.shift
values_buffer = buffers.shift
offsets = body.slice(offsets_buffer.offset, offsets_buffer.length)
diff --git a/ruby/red-arrow-format/lib/arrow-format/type.rb
b/ruby/red-arrow-format/lib/arrow-format/type.rb
index 75586c2f35..b656395634 100644
--- a/ruby/red-arrow-format/lib/arrow-format/type.rb
+++ b/ruby/red-arrow-format/lib/arrow-format/type.rb
@@ -120,7 +120,10 @@ module ArrowFormat
end
end
- class BinaryType < Type
+ class VariableSizeBinaryType < Type
+ end
+
+ class BinaryType < VariableSizeBinaryType
class << self
def singleton
@singleton ||= new
@@ -136,7 +139,27 @@ module ArrowFormat
end
end
- class UTF8Type < Type
+ class LargeBinaryType < VariableSizeBinaryType
+ class << self
+ def singleton
+ @singleton ||= new
+ end
+ end
+
+ def initialize
+ super("LargeBinary")
+ end
+
+ def build_array(size, validity_buffer, offsets_buffer, values_buffer)
+ LargeBinaryArray.new(self,
+ size,
+ validity_buffer,
+ offsets_buffer,
+ values_buffer)
+ end
+ end
+
+ class UTF8Type < VariableSizeBinaryType
class << self
def singleton
@singleton ||= new
diff --git a/ruby/red-arrow-format/test/test-file-reader.rb
b/ruby/red-arrow-format/test/test-file-reader.rb
index 02685b1987..b39d7b1fff 100644
--- a/ruby/red-arrow-format/test/test-file-reader.rb
+++ b/ruby/red-arrow-format/test/test-file-reader.rb
@@ -106,6 +106,17 @@ class TestFileReader < Test::Unit::TestCase
end
end
+ sub_test_case("LargeBinary") do
+ def build_array
+ Arrow::LargeBinaryArray.new(["Hello".b, nil, "World".b])
+ end
+
+ def test_read
+ assert_equal([{"value" => ["Hello".b, nil, "World".b]}],
+ read)
+ end
+ end
+
sub_test_case("UTF8") do
def build_array
Arrow::StringArray.new(["Hello", nil, "World"])