This is an automated email from the ASF dual-hosted git repository.
kou pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/main by this push:
new 8974ddc5a5 GH-48425: [Ruby] Add support for reading dense union array
(#48426)
8974ddc5a5 is described below
commit 8974ddc5a51c25c8f47054fd10c689d38320845a
Author: Sutou Kouhei <[email protected]>
AuthorDate: Thu Dec 11 07:06:53 2025 +0900
GH-48425: [Ruby] Add support for reading dense union array (#48426)
### Rationale for this change
This adds read support for the dense variant of the union array.
### What changes are included in this PR?
* Add `ArrowFormat::DenseUnionType`
* Add `ArrowFormat::DenseUnionArray`
### Are these changes tested?
Yes.
### Are there any user-facing changes?
Yes.
* GitHub Issue: #48425
Authored-by: Sutou Kouhei <[email protected]>
Signed-off-by: Sutou Kouhei <[email protected]>
---
ruby/red-arrow-format/lib/arrow-format/array.rb | 25 +++++++++++++++++++++
.../lib/arrow-format/file-reader.rb | 18 +++++++++++++++
ruby/red-arrow-format/lib/arrow-format/type.rb | 25 +++++++++++++++++++++
ruby/red-arrow-format/test/test-file-reader.rb | 26 ++++++++++++++++++++++
4 files changed, 94 insertions(+)
diff --git a/ruby/red-arrow-format/lib/arrow-format/array.rb
b/ruby/red-arrow-format/lib/arrow-format/array.rb
index fac39c609a..c4220a4367 100644
--- a/ruby/red-arrow-format/lib/arrow-format/array.rb
+++ b/ruby/red-arrow-format/lib/arrow-format/array.rb
@@ -231,6 +231,31 @@ module ArrowFormat
end
end
+ class UnionArray < Array
+ def initialize(type,
+ size,
+ types_buffer,
+ offsets_buffer,
+ children)
+ super(type, size, nil)
+ @types_buffer = types_buffer
+ @offsets_buffer = offsets_buffer
+ @children = children
+ end
+ end
+
+ class DenseUnionArray < UnionArray
+ def to_a
+ children_values = @children.collect(&:to_a)
+ types = @types_buffer.each(:S8, 0, @size)
+ offsets = @offsets_buffer.each(:s32, 0, @size)
+ types.zip(offsets).collect do |(_, type), (_, offset)|
+ index = @type.resolve_type_index(type)
+ children_values[index][offset]
+ end
+ end
+ end
+
class MapArray < VariableSizeListArray
def to_a
super.collect do |entries|
diff --git a/ruby/red-arrow-format/lib/arrow-format/file-reader.rb
b/ruby/red-arrow-format/lib/arrow-format/file-reader.rb
index 4a46382685..68b3c4b64c 100644
--- a/ruby/red-arrow-format/lib/arrow-format/file-reader.rb
+++ b/ruby/red-arrow-format/lib/arrow-format/file-reader.rb
@@ -38,6 +38,8 @@ require_relative "org/apache/arrow/flatbuf/null"
require_relative "org/apache/arrow/flatbuf/precision"
require_relative "org/apache/arrow/flatbuf/schema"
require_relative "org/apache/arrow/flatbuf/struct_"
+require_relative "org/apache/arrow/flatbuf/union"
+require_relative "org/apache/arrow/flatbuf/union_mode"
require_relative "org/apache/arrow/flatbuf/utf8"
module ArrowFormat
@@ -176,6 +178,13 @@ module ArrowFormat
when Org::Apache::Arrow::Flatbuf::Struct
children = fb_field.children.collect {|child| read_field(child)}
type = StructType.new(children)
+ when Org::Apache::Arrow::Flatbuf::Union
+ children = fb_field.children.collect {|child| read_field(child)}
+ type_ids = fb_type.type_ids
+ case fb_type.mode
+ when Org::Apache::Arrow::Flatbuf::UnionMode::DENSE
+ type = DenseUnionType.new(children, type_ids)
+ end
when Org::Apache::Arrow::Flatbuf::Map
type = MapType.new(read_field(fb_field.children[0]))
when Org::Apache::Arrow::Flatbuf::Binary
@@ -225,6 +234,15 @@ module ArrowFormat
read_column(child, nodes, buffers, body)
end
field.type.build_array(length, validity, children)
+ when UnionType
+ # union type doesn't have validity.
+ types = validity
+ offsets_buffer = buffers.shift
+ offsets = body.slice(offsets_buffer.offset, offsets_buffer.length)
+ children = field.type.children.collect do |child|
+ read_column(child, nodes, buffers, body)
+ end
+ field.type.build_array(length, types, offsets, children)
when VariableSizeBinaryType
offsets_buffer = buffers.shift
values_buffer = buffers.shift
diff --git a/ruby/red-arrow-format/lib/arrow-format/type.rb
b/ruby/red-arrow-format/lib/arrow-format/type.rb
index 5516a5807f..c783d87987 100644
--- a/ruby/red-arrow-format/lib/arrow-format/type.rb
+++ b/ruby/red-arrow-format/lib/arrow-format/type.rb
@@ -296,4 +296,29 @@ module ArrowFormat
MapArray.new(self, size, validity_buffer, offsets_buffer, child)
end
end
+
+ class UnionType < Type
+ attr_reader :children
+ attr_reader :type_ids
+ def initialize(name, children, type_ids)
+ super(name)
+ @children = children
+ @type_ids = type_ids
+ @type_indexes = {}
+ end
+
+ def resolve_type_index(type)
+ @type_indexes[type] ||= @type_ids.index(type)
+ end
+ end
+
+ class DenseUnionType < UnionType
+ def initialize(children, type_ids)
+ super("DenseUnion", children, type_ids)
+ end
+
+ def build_array(size, types_buffer, offsets_buffer, children)
+ DenseUnionArray.new(self, size, types_buffer, offsets_buffer, children)
+ end
+ end
end
diff --git a/ruby/red-arrow-format/test/test-file-reader.rb
b/ruby/red-arrow-format/test/test-file-reader.rb
index 72b818314b..2b37a888f7 100644
--- a/ruby/red-arrow-format/test/test-file-reader.rb
+++ b/ruby/red-arrow-format/test/test-file-reader.rb
@@ -230,6 +230,32 @@ class TestFileReader < Test::Unit::TestCase
end
end
+ sub_test_case("DenseUnion") do
+ def build_array
+ fields = [
+ Arrow::Field.new("number", :int8),
+ Arrow::Field.new("text", :string),
+ ]
+ type_ids = [11, 13]
+ data_type = Arrow::DenseUnionDataType.new(fields, type_ids)
+ types = Arrow::Int8Array.new([11, 13, 11, 13, 13])
+ value_offsets = Arrow::Int32Array.new([0, 0, 1, 1, 2])
+ children = [
+ Arrow::Int8Array.new([1, nil]),
+ Arrow::StringArray.new(["a", "b", "c"])
+ ]
+ Arrow::DenseUnionArray.new(data_type,
+ types,
+ value_offsets,
+ children)
+ end
+
+ def test_read
+ assert_equal([{"value" => [1, "a", nil, "b", "c"]}],
+ read)
+ end
+ end
+
sub_test_case("Map") do
def build_array
data_type = Arrow::MapDataType.new(:string, :int8)