EeshanBembi commented on code in PR #21789:
URL: https://github.com/apache/datafusion/pull/21789#discussion_r3130157145
##########
datafusion/functions/src/strings.rs:
##########
@@ -414,6 +430,223 @@ impl ConcatLargeStringBuilder {
}
}
+// ----------------------------------------------------------------------------
+// Bulk-nulls builders
+//
+// These builders are similar to Arrow's `GenericStringBuilder` and
+// `StringViewBuilder`, except that callers must pass the NULL bitmap to
+// `finish()`, rather than maintaining it iteratively (per-row). For callers
+// that can compute the NULL bitmap in bulk (which is true of many
+// string-related UDFs), this can be significantly more efficient.
+//
+// For a row known to be null, call `append_placeholder` to advance the row
+// count without touching the value buffer; the placeholder slot will be
+// masked by the caller-supplied null buffer.
+// ----------------------------------------------------------------------------
+
+/// Builder for a [`GenericStringArray<O>`] that defers null tracking to
+/// `finish`. Instantiate with `O = i32` for [`StringArray`] (Utf8) or
+/// `O = i64` for [`LargeStringArray`] (LargeUtf8).
+pub(crate) struct GenericStringArrayBuilder<O: OffsetSizeTrait> {
+ offsets_buffer: MutableBuffer,
+ value_buffer: MutableBuffer,
+ _phantom: PhantomData<O>,
+}
+
+impl<O: OffsetSizeTrait> GenericStringArrayBuilder<O> {
+ pub fn with_capacity(item_capacity: usize, data_capacity: usize) -> Self {
+ let capacity = item_capacity
+ .checked_add(1)
+ .map(|i| i.saturating_mul(size_of::<O>()))
+ .expect("capacity integer overflow");
+
+ let mut offsets_buffer = MutableBuffer::with_capacity(capacity);
+ offsets_buffer.push(O::usize_as(0));
+ Self {
+ offsets_buffer,
+ value_buffer: MutableBuffer::with_capacity(data_capacity),
+ _phantom: PhantomData,
+ }
+ }
+
+ /// Append `value` as the next row.
+ ///
+ /// # Panics
+ ///
+ /// Panics if the cumulative byte length exceeds `O::MAX`.
+ pub fn append_value(&mut self, value: &str) {
+ self.value_buffer.extend_from_slice(value.as_bytes());
+ let next_offset =
+ O::from_usize(self.value_buffer.len()).expect("byte array offset
overflow");
+ self.offsets_buffer.push(next_offset);
+ }
+
+ /// Append an empty placeholder row. The corresponding slot is meaningful
+ /// only if the caller masks it via the null buffer passed to `finish`.
+ pub fn append_placeholder(&mut self) {
+ let next_offset =
+ O::from_usize(self.value_buffer.len()).expect("byte array offset
overflow");
+ self.offsets_buffer.push(next_offset);
+ }
+
+ /// Finalize into a [`GenericStringArray<O>`] using the caller-supplied
+ /// null buffer.
+ ///
+ /// # Errors
+ ///
+ /// Returns an error when `null_buffer.len()` does not match the number of
+ /// appended rows.
+ pub fn finish(
+ self,
+ null_buffer: Option<NullBuffer>,
+ ) -> Result<GenericStringArray<O>> {
+ let row_count = self.offsets_buffer.len() / size_of::<O>() - 1;
+ if let Some(ref n) = null_buffer
+ && n.len() != row_count
+ {
+ return internal_err!(
+ "Null buffer length ({}) must match row count ({row_count})",
+ n.len()
+ );
+ }
+ let array_data =
ArrayDataBuilder::new(GenericStringArray::<O>::DATA_TYPE)
+ .len(row_count)
+ .add_buffer(self.offsets_buffer.into())
+ .add_buffer(self.value_buffer.into())
+ .nulls(null_buffer);
+ // SAFETY: every appended value came from a `&str`, so the value
+ // buffer is valid UTF-8 and offsets are monotonically non-decreasing.
+ let array_data = unsafe { array_data.build_unchecked() };
+ Ok(GenericStringArray::<O>::from(array_data))
+ }
+}
+
+/// Starting size for the long-string data block; matches Arrow's
+/// `GenericByteViewBuilder` default.
+const STARTING_BLOCK_SIZE: u32 = 8 * 1024;
+/// Maximum size each long-string data block grows to; matches Arrow's
+/// `GenericByteViewBuilder` default.
+const MAX_BLOCK_SIZE: u32 = 2 * 1024 * 1024;
+
+/// Builder for a [`StringViewArray`] that defers null tracking to `finish`.
+///
+/// Modeled on Arrow's [`arrow::array::builder::StringViewBuilder`] but
+/// without per-row [`arrow::array::builder::NullBufferBuilder`] maintenance.
+/// Long strings (> 12 bytes) are appended into an in-progress data block;
+/// short strings are inlined into the view itself. When the in-progress block
+/// fills up it is flushed into `completed` and a new block — double the size
+/// of the last, capped at [`MAX_BLOCK_SIZE`] — is started.
+pub(crate) struct StringViewArrayBuilder {
+ views: Vec<u128>,
+ in_progress: Vec<u8>,
+ completed: Vec<Buffer>,
+ /// Current block-size target; doubles each time a block is flushed, up to
+ /// [`MAX_BLOCK_SIZE`].
+ block_size: u32,
+}
+
+impl StringViewArrayBuilder {
+ pub fn with_capacity(item_capacity: usize) -> Self {
+ Self {
+ views: Vec::with_capacity(item_capacity),
+ in_progress: Vec::new(),
+ completed: Vec::new(),
+ block_size: STARTING_BLOCK_SIZE,
+ }
+ }
+
+ /// Doubles the block-size target (capped at [`MAX_BLOCK_SIZE`]) and
+ /// returns the new size. The first call returns `2 * STARTING_BLOCK_SIZE`.
+ fn next_block_size(&mut self) -> u32 {
+ if self.block_size < MAX_BLOCK_SIZE {
+ self.block_size = self.block_size.saturating_mul(2);
+ }
+ self.block_size
+ }
+
+ /// Append `value` as the next row.
+ ///
+ /// # Panics
+ ///
+ /// Panics if the value length, the in-progress buffer offset, or the
+ /// number of completed buffers exceeds `i32::MAX`. The ByteView spec
+ /// uses signed 32-bit integers for these fields; exceeding `i32::MAX`
+ /// would produce an array that does not round-trip through Arrow IPC
+ /// (see <https://github.com/apache/arrow-rs/issues/6172>).
+ #[inline]
+ pub fn append_value(&mut self, value: &str) {
+ let v = value.as_bytes();
+ let length: u32 =
+ i32::try_from(v.len()).expect("value length exceeds i32::MAX") as
u32;
+ if length <= 12 {
+ self.views.push(make_view(v, 0, 0));
+ return;
+ }
+
+ let required_cap = self.in_progress.len() + length as usize;
+ if self.in_progress.capacity() < required_cap {
+ self.flush_in_progress();
+ let to_reserve = (length as usize).max(self.next_block_size() as
usize);
+ self.in_progress.reserve(to_reserve);
+ }
+
+ let buffer_index: u32 = i32::try_from(self.completed.len())
+ .expect("buffer count exceeds i32::MAX")
+ as u32;
+ let offset: u32 = i32::try_from(self.in_progress.len())
+ .expect("offset exceeds i32::MAX") as u32;
+ self.in_progress.extend_from_slice(v);
+ self.views.push(make_view(v, buffer_index, offset));
+ }
+
+ /// Append an empty placeholder row. The corresponding slot is meaningful
+ /// only if the caller masks it via the null buffer passed to `finish`.
+ #[inline]
+ pub fn append_placeholder(&mut self) {
Review Comment:
`append_placeholder()` pushes a zero-length view (`0u128`) that is
byte-for-byte identical to a real empty-string view. If a caller ever calls
`finish(None)` after having called `append_placeholder()`, those rows will
materialise as `""` instead of `null` — silently wrong.
The current `case_conversion` usage always provides the input null buffer,
so there's no bug today. But please either:
1. Add a `debug_assert!` in `finish()` that `null_buffer.is_some()` when any
placeholder was appended (you can track `self.placeholder_count > 0`), **or**
2. Document in the `append_placeholder` doc-comment that callers **must**
pass a valid null buffer to `finish()`.
Option 1 is safer as more UDFs adopt this builder.
##########
datafusion/functions/src/string/common.rs:
##########
@@ -399,18 +410,28 @@ where
// Values contain non-ASCII.
let item_len = string_array.len();
- let capacity = string_array.value_data().len() + PRE_ALLOC_BYTES;
- let mut builder = GenericStringBuilder::<O>::with_capacity(item_len,
capacity);
+ let capacity = value_data.len() + PRE_ALLOC_BYTES;
+ let nulls = string_array.nulls().cloned();
Review Comment:
The code passes the *input* null buffer directly to the *output* builder:
```rust
let nulls = string_array.nulls().cloned();
// ... write output values for non-null inputs ...
builder.finish(nulls)?
```
This is correct for case conversion (null-in ↔ null-out), but the assumption
isn't stated. Since more UDFs are planned to adopt this builder, a brief
comment here will prevent future callers from reusing the pattern in
non-null-preserving operations:
```rust
// Case conversion is null-preserving: input null ↔ output null,
// so we can pass the input null buffer directly to the output builder.
let nulls = string_array.nulls().cloned();
```
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]