neilconway commented on code in PR #22029:
URL: https://github.com/apache/datafusion/pull/22029#discussion_r3235245367
##########
datafusion/functions/src/strings.rs:
##########
@@ -673,18 +911,123 @@ impl StringViewArrayBuilder {
}
}
+/// [`StringWriter`] for [`StringViewArrayBuilder`].
+///
+/// The writer accumulates the first up-to-12 bytes of a row in a stack
+/// buffer; if the row stays inline-sized, it never touches the data block.
+/// On the first write that would exceed 12 bytes, the stack buffer is
+/// spilled into the builder's in-progress block and subsequent writes go
+/// directly there.
+pub(crate) struct StringViewWriter<'a> {
+ inline_buf: [u8; 12],
+ inline_len: u8,
+ /// `None` while the row fits inline; becomes `Some(start)` (offset of
+ /// the row's first byte in `in_progress`) at first spill.
+ spill_cursor: Option<usize>,
+ builder: &'a mut StringViewArrayBuilder,
+}
+
+impl StringWriter for StringViewWriter<'_> {
+ #[inline]
+ fn write_str(&mut self, s: &str) {
+ let bytes = s.as_bytes();
+ match self.spill_cursor {
+ None => {
+ let inline_len = self.inline_len as usize;
+ let new_len = inline_len + bytes.len();
+ if new_len <= 12 {
+
self.inline_buf[inline_len..new_len].copy_from_slice(bytes);
+ self.inline_len = new_len as u8;
+ } else {
+ // First spill of this row: reserve capacity (which may
+ // flush the current block — safe, no row-data is in it
+ // yet for this row), copy the buffered prefix, then
+ // write the new bytes.
+ self.builder.ensure_long_capacity(new_len as u32);
+ let cursor = self.builder.in_progress.len();
+ self.builder
+ .in_progress
+ .extend_from_slice(&self.inline_buf[..inline_len]);
+ self.builder.in_progress.extend_from_slice(bytes);
+ self.spill_cursor = Some(cursor);
+ }
+ }
+ Some(_) => {
+ self.builder.in_progress.extend_from_slice(bytes);
+ }
+ }
+ }
+
+ #[inline]
+ fn write_char(&mut self, c: char) {
+ let len = c.len_utf8();
+ match self.spill_cursor {
+ None => {
+ let inline_len = self.inline_len as usize;
+ let new_len = inline_len + len;
+ if new_len <= 12 {
+ c.encode_utf8(&mut self.inline_buf[inline_len..new_len]);
+ self.inline_len = new_len as u8;
+ } else {
+ self.builder.ensure_long_capacity(new_len as u32);
+ let cursor = self.builder.in_progress.len();
+ self.builder
+ .in_progress
+ .extend_from_slice(&self.inline_buf[..inline_len]);
+ push_char_to_vec(&mut self.builder.in_progress, c);
+ self.spill_cursor = Some(cursor);
+ }
+ }
+ Some(_) => {
+ push_char_to_vec(&mut self.builder.in_progress, c);
+ }
+ }
+ }
+}
+
/// Appends the UTF-8 encoding of `c` to the end of `v`.
///
/// ASCII characters take a single-byte fast path. Every other character is
/// encoded into a small stack buffer first and then copied into `v`.
///
/// This is written entirely in safe code on purpose: spare `Vec` capacity
/// is uninitialized, and `std::slice::from_raw_parts_mut` requires its
/// elements to be initialized, so the reserved tail of `v` must not be
/// handed to `char::encode_utf8` as a `&mut [u8]`.
#[inline]
fn push_char_to_vec(v: &mut Vec<u8>, c: char) {
    if c.is_ascii() {
        // A one-byte encoding is just the code point itself.
        v.push(c as u8);
    } else {
        // Four bytes is the longest UTF-8 encoding of any `char`.
        let mut scratch = [0u8; 4];
        v.extend_from_slice(c.encode_utf8(&mut scratch).as_bytes());
    }
}
+
/// Trait abstracting over the bulk-NULL string array builders.
///
/// Similar to Arrow's `StringLikeArrayBuilder`, this allows generic dispatch
/// over the three string array types (Utf8, LargeUtf8, Utf8View) when the
/// function body is uniform across them.
pub(crate) trait BulkNullStringArrayBuilder {
+ /// Per-builder concrete writer type, exposed as a GAT so generic callers
+ /// can use the inherent (non-`dyn`) writer methods without vtable
+ /// dispatch.
+ type Writer<'a>: StringWriter
+ where
+ Self: 'a;
+
fn append_value(&mut self, value: &str);
Review Comment:
Yep, makes sense -- I consolidated most of the docs in the trait and added
back-links.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]