neilconway commented on code in PR #21789:
URL: https://github.com/apache/datafusion/pull/21789#discussion_r3131981689


##########
datafusion/functions/src/strings.rs:
##########
@@ -414,6 +430,223 @@ impl ConcatLargeStringBuilder {
     }
 }
 
+// ----------------------------------------------------------------------------
+// Bulk-nulls builders
+//
+// These builders are similar to Arrow's `GenericStringBuilder` and
+// `StringViewBuilder`, except that callers must pass the NULL bitmap to
+// `finish()`, rather than maintaining it iteratively (per-row). For callers
+// that can compute the NULL bitmap in bulk (which is true of many
+// string-related UDFs), this can be significantly more efficient.
+//
+// For a row known to be null, call `append_placeholder` to advance the row
+// count without touching the value buffer; the placeholder slot will be
+// masked by the caller-supplied null buffer.
+// ----------------------------------------------------------------------------
+
+/// Builder for a [`GenericStringArray<O>`] that defers null tracking to
+/// `finish`. Instantiate with `O = i32` for [`StringArray`] (Utf8) or
+/// `O = i64` for [`LargeStringArray`] (LargeUtf8).
+pub(crate) struct GenericStringArrayBuilder<O: OffsetSizeTrait> {
+    offsets_buffer: MutableBuffer,
+    value_buffer: MutableBuffer,
+    _phantom: PhantomData<O>,
+}
+
+impl<O: OffsetSizeTrait> GenericStringArrayBuilder<O> {
+    pub fn with_capacity(item_capacity: usize, data_capacity: usize) -> Self {
+        let capacity = item_capacity
+            .checked_add(1)
+            .map(|i| i.saturating_mul(size_of::<O>()))
+            .expect("capacity integer overflow");
+
+        let mut offsets_buffer = MutableBuffer::with_capacity(capacity);
+        offsets_buffer.push(O::usize_as(0));
+        Self {
+            offsets_buffer,
+            value_buffer: MutableBuffer::with_capacity(data_capacity),
+            _phantom: PhantomData,
+        }
+    }
+
+    /// Append `value` as the next row.
+    ///
+    /// # Panics
+    ///
+    /// Panics if the cumulative byte length exceeds `O::MAX`.
+    pub fn append_value(&mut self, value: &str) {
+        self.value_buffer.extend_from_slice(value.as_bytes());
+        let next_offset =
+            O::from_usize(self.value_buffer.len()).expect("byte array offset 
overflow");
+        self.offsets_buffer.push(next_offset);
+    }
+
+    /// Append an empty placeholder row. The corresponding slot is meaningful
+    /// only if the caller masks it via the null buffer passed to `finish`.
+    pub fn append_placeholder(&mut self) {
+        let next_offset =
+            O::from_usize(self.value_buffer.len()).expect("byte array offset 
overflow");
+        self.offsets_buffer.push(next_offset);
+    }
+
+    /// Finalize into a [`GenericStringArray<O>`] using the caller-supplied
+    /// null buffer.
+    ///
+    /// # Errors
+    ///
+    /// Returns an error when `null_buffer.len()` does not match the number of
+    /// appended rows.
+    pub fn finish(
+        self,
+        null_buffer: Option<NullBuffer>,
+    ) -> Result<GenericStringArray<O>> {
+        let row_count = self.offsets_buffer.len() / size_of::<O>() - 1;
+        if let Some(ref n) = null_buffer
+            && n.len() != row_count
+        {
+            return internal_err!(
+                "Null buffer length ({}) must match row count ({row_count})",
+                n.len()
+            );
+        }
+        let array_data = 
ArrayDataBuilder::new(GenericStringArray::<O>::DATA_TYPE)
+            .len(row_count)
+            .add_buffer(self.offsets_buffer.into())
+            .add_buffer(self.value_buffer.into())
+            .nulls(null_buffer);
+        // SAFETY: every appended value came from a `&str`, so the value
+        // buffer is valid UTF-8 and offsets are monotonically non-decreasing.
+        let array_data = unsafe { array_data.build_unchecked() };
+        Ok(GenericStringArray::<O>::from(array_data))
+    }
+}
+
+/// Starting size for the long-string data block; matches Arrow's
+/// `GenericByteViewBuilder` default.
+const STARTING_BLOCK_SIZE: u32 = 8 * 1024;
+/// Maximum size each long-string data block grows to; matches Arrow's
+/// `GenericByteViewBuilder` default.
+const MAX_BLOCK_SIZE: u32 = 2 * 1024 * 1024;
+
+/// Builder for a [`StringViewArray`] that defers null tracking to `finish`.
+///
+/// Modeled on Arrow's [`arrow::array::builder::StringViewBuilder`] but
+/// without per-row [`arrow::array::builder::NullBufferBuilder`] maintenance.
+/// Long strings (> 12 bytes) are appended into an in-progress data block;
+/// short strings are inlined into the view itself. When the in-progress block
+/// fills up it is flushed into `completed` and a new block — double the size
+/// of the last, capped at [`MAX_BLOCK_SIZE`] — is started.
+pub(crate) struct StringViewArrayBuilder {
+    views: Vec<u128>,
+    in_progress: Vec<u8>,
+    completed: Vec<Buffer>,
+    /// Current block-size target; doubles each time a block is flushed, up to
+    /// [`MAX_BLOCK_SIZE`].
+    block_size: u32,
+}
+
+impl StringViewArrayBuilder {
+    pub fn with_capacity(item_capacity: usize) -> Self {
+        Self {
+            views: Vec::with_capacity(item_capacity),
+            in_progress: Vec::new(),
+            completed: Vec::new(),
+            block_size: STARTING_BLOCK_SIZE,
+        }
+    }
+
+    /// Doubles the block-size target (capped at [`MAX_BLOCK_SIZE`]) and
+    /// returns the new size. The first call returns `2 * STARTING_BLOCK_SIZE`.
+    fn next_block_size(&mut self) -> u32 {
+        if self.block_size < MAX_BLOCK_SIZE {
+            self.block_size = self.block_size.saturating_mul(2);
+        }
+        self.block_size
+    }
+
+    /// Append `value` as the next row.
+    ///
+    /// # Panics
+    ///
+    /// Panics if the value length, the in-progress buffer offset, or the
+    /// number of completed buffers exceeds `i32::MAX`. The ByteView spec
+    /// uses signed 32-bit integers for these fields; exceeding `i32::MAX`
+    /// would produce an array that does not round-trip through Arrow IPC
+    /// (see <https://github.com/apache/arrow-rs/issues/6172>).
+    #[inline]
+    pub fn append_value(&mut self, value: &str) {
+        let v = value.as_bytes();
+        let length: u32 =
+            i32::try_from(v.len()).expect("value length exceeds i32::MAX") as 
u32;
+        if length <= 12 {
+            self.views.push(make_view(v, 0, 0));
+            return;
+        }
+
+        let required_cap = self.in_progress.len() + length as usize;
+        if self.in_progress.capacity() < required_cap {
+            self.flush_in_progress();
+            let to_reserve = (length as usize).max(self.next_block_size() as 
usize);
+            self.in_progress.reserve(to_reserve);
+        }
+
+        let buffer_index: u32 = i32::try_from(self.completed.len())
+            .expect("buffer count exceeds i32::MAX")
+            as u32;
+        let offset: u32 = i32::try_from(self.in_progress.len())
+            .expect("offset exceeds i32::MAX") as u32;
+        self.in_progress.extend_from_slice(v);
+        self.views.push(make_view(v, buffer_index, offset));
+    }
+
+    /// Append an empty placeholder row. The corresponding slot is meaningful
+    /// only if the caller masks it via the null buffer passed to `finish`.
+    #[inline]
+    pub fn append_placeholder(&mut self) {

Review Comment:
   Yeah, I was considering whether to do that. It wouldn't be the first 
Arrow/DF API that has some footguns :) But on reflection I think the 
performance cost of tracking the placeholder count is pretty tiny and probably 
worth doing; added.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to