This is an automated email from the ASF dual-hosted git repository.
kou pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/main by this push:
new 7b54614f57 GH-48347: [Ruby] Add support for reading list array (#48351)
7b54614f57 is described below
commit 7b54614f5758e17e93d4815e65657c14fccb2868
Author: Sutou Kouhei <[email protected]>
AuthorDate: Fri Dec 5 20:43:38 2025 +0900
GH-48347: [Ruby] Add support for reading list array (#48351)
### Rationale for this change
This is the first nested type.
### What changes are included in this PR?
* Add `ArrowFormat::ListType`
* Add `ArrowFormat::ListArray`
### Are these changes tested?
Yes.
### Are there any user-facing changes?
Yes.
* GitHub Issue: #48347
Authored-by: Sutou Kouhei <[email protected]>
Signed-off-by: Sutou Kouhei <[email protected]>
---
ruby/red-arrow-format/lib/arrow-format/array.rb | 19 ++++++
.../lib/arrow-format/file-reader.rb | 74 +++++++++++++---------
ruby/red-arrow-format/lib/arrow-format/type.rb | 13 +++-
ruby/red-arrow-format/test/test-file-reader.rb | 12 ++++
4 files changed, 88 insertions(+), 30 deletions(-)
diff --git a/ruby/red-arrow-format/lib/arrow-format/array.rb b/ruby/red-arrow-format/lib/arrow-format/array.rb
index 6d164cc0b5..2a304f5416 100644
--- a/ruby/red-arrow-format/lib/arrow-format/array.rb
+++ b/ruby/red-arrow-format/lib/arrow-format/array.rb
@@ -123,4 +123,23 @@ module ArrowFormat
Encoding::UTF_8
end
end
+
+ class ListArray < Array
+ def initialize(type, size, validity_buffer, offsets_buffer, child)
+ super(type, size, validity_buffer)
+ @offsets_buffer = offsets_buffer
+ @child = child
+ end
+
+ def to_a
+ child_values = @child.to_a
+ values = @offsets_buffer.
+ each(:s32, 0, @size + 1). # TODO: big endian support
+ each_cons(2).
+ collect do |(_, offset), (_, next_offset)|
+ child_values[offset...next_offset]
+ end
+ apply_validity(values)
+ end
+ end
end
diff --git a/ruby/red-arrow-format/lib/arrow-format/file-reader.rb b/ruby/red-arrow-format/lib/arrow-format/file-reader.rb
index 733140a10b..3db6bad77a 100644
--- a/ruby/red-arrow-format/lib/arrow-format/file-reader.rb
+++ b/ruby/red-arrow-format/lib/arrow-format/file-reader.rb
@@ -22,14 +22,15 @@ require_relative "record-batch"
require_relative "schema"
require_relative "type"
+require_relative "org/apache/arrow/flatbuf/binary"
require_relative "org/apache/arrow/flatbuf/bool"
require_relative "org/apache/arrow/flatbuf/footer"
-require_relative "org/apache/arrow/flatbuf/message"
-require_relative "org/apache/arrow/flatbuf/binary"
require_relative "org/apache/arrow/flatbuf/int"
+require_relative "org/apache/arrow/flatbuf/list"
+require_relative "org/apache/arrow/flatbuf/message"
require_relative "org/apache/arrow/flatbuf/null"
-require_relative "org/apache/arrow/flatbuf/utf8"
require_relative "org/apache/arrow/flatbuf/schema"
+require_relative "org/apache/arrow/flatbuf/utf8"
module ArrowFormat
class FileReader
@@ -90,9 +91,10 @@ module ArrowFormat
when Org::Apache::Arrow::Flatbuf::RecordBatch
n_rows = header.length
columns = []
+ nodes = header.nodes
buffers = header.buffers
schema.fields.each do |field|
- columns << read_column(field, n_rows, buffers, body)
+ columns << read_column(field, nodes, buffers, body)
end
yield(RecordBatch.new(schema, n_rows, columns))
end
@@ -129,35 +131,44 @@ module ArrowFormat
Org::Apache::Arrow::Flatbuf::Footer.new(footer_data)
end
- def read_schema(fb_schema)
- fields = fb_schema.fields.collect do |fb_field|
- fb_type = fb_field.type
- case fb_type
- when Org::Apache::Arrow::Flatbuf::Null
- type = NullType.singleton
- when Org::Apache::Arrow::Flatbuf::Bool
- type = BooleanType.singleton
- when Org::Apache::Arrow::Flatbuf::Int
- case fb_type.bit_width
- when 8
- if fb_type.signed?
- type = Int8Type.singleton
- else
- type = UInt8Type.singleton
- end
+ def read_field(fb_field)
+ fb_type = fb_field.type
+ case fb_type
+ when Org::Apache::Arrow::Flatbuf::Null
+ type = NullType.singleton
+ when Org::Apache::Arrow::Flatbuf::Bool
+ type = BooleanType.singleton
+ when Org::Apache::Arrow::Flatbuf::Int
+ case fb_type.bit_width
+ when 8
+ if fb_type.signed?
+ type = Int8Type.singleton
+ else
+ type = UInt8Type.singleton
end
- when Org::Apache::Arrow::Flatbuf::Binary
- type = BinaryType.singleton
- when Org::Apache::Arrow::Flatbuf::Utf8
- type = UTF8Type.singleton
end
- Field.new(fb_field.name, type)
+ when Org::Apache::Arrow::Flatbuf::List
+ type = ListType.new(read_field(fb_field.children[0]))
+ when Org::Apache::Arrow::Flatbuf::Binary
+ type = BinaryType.singleton
+ when Org::Apache::Arrow::Flatbuf::Utf8
+ type = UTF8Type.singleton
+ end
+ Field.new(fb_field.name, type)
+ end
+
+ def read_schema(fb_schema)
+ fields = fb_schema.fields.collect do |fb_field|
+ read_field(fb_field)
end
Schema.new(fields)
end
- def read_column(field, n_rows, buffers, body)
- return field.type.build_array(n_rows) if field.type.is_a?(NullType)
+ def read_column(field, nodes, buffers, body)
+ node = nodes.shift
+ length = node.length
+
+ return field.type.build_array(length) if field.type.is_a?(NullType)
validity_buffer = buffers.shift
if validity_buffer.length.zero?
@@ -172,14 +183,19 @@ module ArrowFormat
UInt8Type
values_buffer = buffers.shift
values = body.slice(values_buffer.offset, values_buffer.length)
- field.type.build_array(n_rows, validity, values)
+ field.type.build_array(length, validity, values)
+ when ListType
+ offsets_buffer = buffers.shift
+ offsets = body.slice(offsets_buffer.offset, offsets_buffer.length)
+ child = read_column(field.type.child, nodes, buffers, body)
+ field.type.build_array(length, validity, offsets, child)
when BinaryType,
UTF8Type
offsets_buffer = buffers.shift
values_buffer = buffers.shift
offsets = body.slice(offsets_buffer.offset, offsets_buffer.length)
values = body.slice(values_buffer.offset, values_buffer.length)
- field.type.build_array(n_rows, validity, offsets, values)
+ field.type.build_array(length, validity, offsets, values)
end
end
end
diff --git a/ruby/red-arrow-format/lib/arrow-format/type.rb b/ruby/red-arrow-format/lib/arrow-format/type.rb
index 22a246aeab..c792eac175 100644
--- a/ruby/red-arrow-format/lib/arrow-format/type.rb
+++ b/ruby/red-arrow-format/lib/arrow-format/type.rb
@@ -103,7 +103,6 @@ module ArrowFormat
end
end
- attr_reader :name
def initialize
super("Binary")
end
@@ -129,4 +128,16 @@ module ArrowFormat
UTF8Array.new(self, size, validity_buffer, offsets_buffer, values_buffer)
end
end
+
+ class ListType < Type
+ attr_reader :child
+ def initialize(child)
+ super("List")
+ @child = child
+ end
+
+ def build_array(size, validity_buffer, offsets_buffer, child)
+ ListArray.new(self, size, validity_buffer, offsets_buffer, child)
+ end
+ end
end
diff --git a/ruby/red-arrow-format/test/test-file-reader.rb b/ruby/red-arrow-format/test/test-file-reader.rb
index 0029a57887..95cb6f3b1a 100644
--- a/ruby/red-arrow-format/test/test-file-reader.rb
+++ b/ruby/red-arrow-format/test/test-file-reader.rb
@@ -105,4 +105,16 @@ class TestFileReader < Test::Unit::TestCase
read)
end
end
+
+ sub_test_case("List") do
+ def build_array
+ data_type = Arrow::ListDataType.new(name: "count", type: :int8)
+ Arrow::ListArray.new(data_type, [[-128, 127], nil, [-1, 0, 1]])
+ end
+
+ def test_read
+ assert_equal([{"value" => [[-128, 127], nil, [-1, 0, 1]]}],
+ read)
+ end
+ end
end