This is an automated email from the ASF dual-hosted git repository.
alamb pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git
The following commit(s) were added to refs/heads/main by this push:
new daf31bec63 [Variant] Add `VariantBuilder::new_with_buffers` to write
to existing buffers (#7912)
daf31bec63 is described below
commit daf31bec63836f1fe7bb0a9fa1a98467546374fc
Author: Andrew Lamb <[email protected]>
AuthorDate: Sun Jul 13 08:03:44 2025 -0400
[Variant] Add `VariantBuilder::new_with_buffers` to write to existing
buffers (#7912)
# Which issue does this PR close?
- closes https://github.com/apache/arrow-rs/issues/7805
- part of https://github.com/apache/arrow-rs/issues/6736
- part of https://github.com/apache/arrow-rs/pull/7911
# Rationale for this change
I would like to be able to write Variants directly into the target
buffer when writing multiple variants However, the current
VariantBuilder allocates a new bufffer for each variant
# What changes are included in this PR?
1. Add `VariantBuilder::new_with_buffers` and docs and tests
You can see how this API can be used to write directly into a buffer in
VariantArrayBuilder in this PR:
- https://github.com/apache/arrow-rs/pull/7911
# Are these changes tested?
Yes new tests
# Are there any user-facing changes?
New API
---
parquet-variant/src/builder.rs | 199 +++++++++++++++++++++++++++++++++++++----
1 file changed, 184 insertions(+), 15 deletions(-)
diff --git a/parquet-variant/src/builder.rs b/parquet-variant/src/builder.rs
index 33608d27cb..15ae9a9641 100644
--- a/parquet-variant/src/builder.rs
+++ b/parquet-variant/src/builder.rs
@@ -61,9 +61,35 @@ fn write_offset(buf: &mut Vec<u8>, value: usize, nbytes: u8)
{
buf.extend_from_slice(&bytes[..nbytes as usize]);
}
-#[derive(Default)]
+/// Wrapper around a `Vec<u8>` that provides methods for appending
+/// primitive values, variant types, and metadata.
+///
+/// This is used internally by the builders to construct the
+/// the `value` field for [`Variant`] values.
+///
+/// You can reuse an existing `Vec<u8>` by using the `from` impl
+#[derive(Debug, Default)]
struct ValueBuffer(Vec<u8>);
+impl ValueBuffer {
+ /// Construct a ValueBuffer that will write to a new underlying `Vec`
+ fn new() -> Self {
+ Default::default()
+ }
+}
+
+impl From<Vec<u8>> for ValueBuffer {
+ fn from(value: Vec<u8>) -> Self {
+ Self(value)
+ }
+}
+
+impl From<ValueBuffer> for Vec<u8> {
+ fn from(value_buffer: ValueBuffer) -> Self {
+ value_buffer.0
+ }
+}
+
impl ValueBuffer {
fn append_u8(&mut self, term: u8) {
self.0.push(term);
@@ -82,7 +108,7 @@ impl ValueBuffer {
}
fn into_inner(self) -> Vec<u8> {
- self.0
+ self.into()
}
fn inner_mut(&mut self) -> &mut Vec<u8> {
@@ -252,13 +278,31 @@ impl ValueBuffer {
}
}
-#[derive(Default)]
+/// Builder for constructing metadata for [`Variant`] values.
+///
+/// This is used internally by the [`VariantBuilder`] to construct the metadata
+///
+/// You can use an existing `Vec<u8>` as the metadata buffer by using the
`from` impl.
+#[derive(Default, Debug)]
struct MetadataBuilder {
// Field names -- field_ids are assigned in insert order
field_names: IndexSet<String>,
// flag that checks if field names by insertion order are also
lexicographically sorted
is_sorted: bool,
+
+ /// Output buffer. Metadata is written to the end of this buffer
+ metadata_buffer: Vec<u8>,
+}
+
+/// Create a new MetadataBuilder that will write to the specified metadata
buffer
+impl From<Vec<u8>> for MetadataBuilder {
+ fn from(metadata_buffer: Vec<u8>) -> Self {
+ Self {
+ metadata_buffer,
+ ..Default::default()
+ }
+ }
}
impl MetadataBuilder {
@@ -307,6 +351,12 @@ impl MetadataBuilder {
// Calculate metadata size
let total_dict_size: usize = self.metadata_size();
+ let Self {
+ field_names,
+ is_sorted,
+ mut metadata_buffer,
+ } = self;
+
// Determine appropriate offset size based on the larger of dict size
or total string size
let max_offset = std::cmp::max(total_dict_size, nkeys);
let offset_size = int_size(max_offset);
@@ -315,29 +365,29 @@ impl MetadataBuilder {
let string_start = offset_start + (nkeys + 1) * offset_size as usize;
let metadata_size = string_start + total_dict_size;
- let mut metadata = Vec::with_capacity(metadata_size);
+ metadata_buffer.reserve(metadata_size);
// Write header: version=1, field names are sorted, with calculated
offset_size
- metadata.push(0x01 | (self.is_sorted as u8) << 4 | ((offset_size - 1)
<< 6));
+ metadata_buffer.push(0x01 | (is_sorted as u8) << 4 | ((offset_size -
1) << 6));
// Write dictionary size
- write_offset(&mut metadata, nkeys, offset_size);
+ write_offset(&mut metadata_buffer, nkeys, offset_size);
// Write offsets
let mut cur_offset = 0;
- for key in self.field_names.iter() {
- write_offset(&mut metadata, cur_offset, offset_size);
+ for key in field_names.iter() {
+ write_offset(&mut metadata_buffer, cur_offset, offset_size);
cur_offset += key.len();
}
// Write final offset
- write_offset(&mut metadata, cur_offset, offset_size);
+ write_offset(&mut metadata_buffer, cur_offset, offset_size);
// Write string data
- for key in self.field_names {
- metadata.extend_from_slice(key.as_bytes());
+ for key in field_names {
+ metadata_buffer.extend_from_slice(key.as_bytes());
}
- metadata
+ metadata_buffer
}
}
@@ -570,6 +620,41 @@ impl ParentState<'_> {
/// );
///
/// ```
+/// # Example: Reusing Buffers
+///
+/// You can use the [`VariantBuilder`] to write into existing buffers (for
+/// example to write multiple variants back to back in the same buffer)
+///
+/// ```
+/// // we will write two variants back to back
+/// use parquet_variant::{Variant, VariantBuilder};
+/// // Append 12345
+/// let mut builder = VariantBuilder::new();
+/// builder.append_value(12345);
+/// let (metadata, value) = builder.finish();
+/// // remember where the first variant ends
+/// let (first_meta_offset, first_meta_len) = (0, metadata.len());
+/// let (first_value_offset, first_value_len) = (0, value.len());
+///
+/// // now, append a second variant to the same buffers
+/// let mut builder = VariantBuilder::new_with_buffers(metadata, value);
+/// builder.append_value("Foo");
+/// let (metadata, value) = builder.finish();
+///
+/// // The variants can be referenced in their appropriate location
+/// let variant1 = Variant::new(
+/// &metadata[first_meta_offset..first_meta_len],
+/// &value[first_value_offset..first_value_len]
+/// );
+/// assert_eq!(variant1, Variant::Int32(12345));
+///
+/// let variant2 = Variant::new(
+/// &metadata[first_meta_len..],
+/// &value[first_value_len..]
+/// );
+/// assert_eq!(variant2, Variant::from("Foo"));
+/// ```
+///
/// # Example: Unique Field Validation
///
/// This example shows how enabling unique field validation will cause an error
@@ -626,8 +711,7 @@ impl ParentState<'_> {
/// let (metadata, value) = builder.finish();
/// let variant = Variant::try_new(&metadata, &value).unwrap();
/// ```
-///
-#[derive(Default)]
+#[derive(Default, Debug)]
pub struct VariantBuilder {
buffer: ValueBuffer,
metadata_builder: MetadataBuilder,
@@ -635,14 +719,25 @@ pub struct VariantBuilder {
}
impl VariantBuilder {
+ /// Create a new VariantBuilder with new underlying buffer
pub fn new() -> Self {
Self {
- buffer: ValueBuffer::default(),
+ buffer: ValueBuffer::new(),
metadata_builder: MetadataBuilder::default(),
validate_unique_fields: false,
}
}
+ /// Create a new VariantBuilder that will write the metadata and values to
+ /// the specified buffers.
+ pub fn new_with_buffers(metadata_buffer: Vec<u8>, value_buffer: Vec<u8>)
-> Self {
+ Self {
+ buffer: ValueBuffer::from(value_buffer),
+ metadata_builder: MetadataBuilder::from(metadata_buffer),
+ validate_unique_fields: false,
+ }
+ }
+
/// Enables validation of unique field keys in nested objects.
///
/// This setting is propagated to all [`ObjectBuilder`]s created through
this [`VariantBuilder`]
@@ -1916,6 +2011,80 @@ mod tests {
assert_eq!(metadata.num_field_names(), 3);
}
+ /// Test reusing buffers with nested objects
+ #[test]
+ fn test_with_existing_buffers_nested() {
+ let mut builder = VariantBuilder::new();
+ append_test_list(&mut builder);
+ let (m1, v1) = builder.finish();
+ let variant1 = Variant::new(&m1, &v1);
+
+ let mut builder = VariantBuilder::new();
+ append_test_object(&mut builder);
+ let (m2, v2) = builder.finish();
+ let variant2 = Variant::new(&m2, &v2);
+
+ let mut builder = VariantBuilder::new();
+ builder.append_value("This is a string");
+ let (m3, v3) = builder.finish();
+ let variant3 = Variant::new(&m3, &v3);
+
+ // Now, append those three variants to the a new buffer that is reused
+ let mut builder = VariantBuilder::new();
+ append_test_list(&mut builder);
+ let (metadata, value) = builder.finish();
+ let (meta1_offset, meta1_end) = (0, metadata.len());
+ let (value1_offset, value1_end) = (0, value.len());
+
+ // reuse same buffer
+ let mut builder = VariantBuilder::new_with_buffers(metadata, value);
+ append_test_object(&mut builder);
+ let (metadata, value) = builder.finish();
+ let (meta2_offset, meta2_end) = (meta1_end, metadata.len());
+ let (value2_offset, value2_end) = (value1_end, value.len());
+
+ // Append a string
+ let mut builder = VariantBuilder::new_with_buffers(metadata, value);
+ builder.append_value("This is a string");
+ let (metadata, value) = builder.finish();
+ let (meta3_offset, meta3_end) = (meta2_end, metadata.len());
+ let (value3_offset, value3_end) = (value2_end, value.len());
+
+ // verify we can read the variants back correctly
+ let roundtrip1 = Variant::new(
+ &metadata[meta1_offset..meta1_end],
+ &value[value1_offset..value1_end],
+ );
+ assert_eq!(roundtrip1, variant1,);
+
+ let roundtrip2 = Variant::new(
+ &metadata[meta2_offset..meta2_end],
+ &value[value2_offset..value2_end],
+ );
+ assert_eq!(roundtrip2, variant2,);
+
+ let roundtrip3 = Variant::new(
+ &metadata[meta3_offset..meta3_end],
+ &value[value3_offset..value3_end],
+ );
+ assert_eq!(roundtrip3, variant3);
+ }
+
+ /// append a simple List variant
+ fn append_test_list(builder: &mut VariantBuilder) {
+ let mut list = builder.new_list();
+ list.append_value(1234);
+ list.append_value("a string value");
+ list.finish();
+ }
+
+ /// append an object variant
+ fn append_test_object(builder: &mut VariantBuilder) {
+ let mut obj = builder.new_object();
+ obj.insert("a", true);
+ obj.finish().unwrap();
+ }
+
#[test]
fn test_variant_builder_to_list_builder_no_finish() {
// Create a list builder but never finish it