This is an automated email from the ASF dual-hosted git repository.

alamb pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git


The following commit(s) were added to refs/heads/main by this push:
     new 6aae76b0f7 perf: optimize hex decoding in json (1.8x faster in 
binary-heavy) (#9091)
6aae76b0f7 is described below

commit 6aae76b0f7fbbfdd27b1249629f44c531b5b0f5e
Author: Alex Huang <[email protected]>
AuthorDate: Sat Jan 10 22:01:06 2026 +0200

    perf: optimize hex decoding in json (1.8x faster in binary-heavy) (#9091)
    
    # Which issue does this PR close?
    
    <!--
    We generally require a GitHub issue to be filed for all bug fixes and
    enhancements and this helps us generate change logs for our releases.
    You can link an issue to this PR using the GitHub syntax.
    -->
    
    - Closes #NNN.
    
    # Rationale for this change
    
    Improve JSON binary decoding performance by avoiding per-value
    allocations and enabling direct hex decoding into builders.
    
    <!--
    Why are you proposing this change? If this is already explained clearly
    in the issue then this section is not needed.
    Explaining clearly why changes are proposed helps reviewers understand
    your changes and offer better suggestions for fixes.
    -->
    
    # What changes are included in this PR?
    
    Optimized binary hex decoding paths to reduce allocations and improve
    throughput.
    
    
    ```
    decode_binary_hex_json  time:   [3.6780 ms 3.6953 ms 3.7150 ms]
                            change: [−61.051% −60.818% −60.565%] (p = 0.00 < 
0.05)
                            Performance has improved.
    Found 8 outliers among 100 measurements (8.00%)
      5 (5.00%) high mild
      3 (3.00%) high severe
    
    decode_fixed_binary_hex_json
                            time:   [4.0404 ms 4.1400 ms 4.2901 ms]
                            change: [−56.149% −55.040% −53.330%] (p = 0.00 < 
0.05)
                            Performance has improved.
    Found 19 outliers among 100 measurements (19.00%)
      7 (7.00%) high mild
      12 (12.00%) high severe
    
    decode_binary_view_hex_json
                            time:   [4.3731 ms 4.4242 ms 4.4767 ms]
                            change: [−53.305% −52.771% −52.239%] (p = 0.00 < 
0.05)
                            Performance has improved.
    Found 1 outliers among 100 measurements (1.00%)
      1 (1.00%) high mild
    ```
    
    <!--
    There is no need to duplicate the description in the issue here but it
    is sometimes worth providing a summary of the individual changes in this
    PR.
    -->
    
    # Are these changes tested?
    Yes
    
    <!--
    We typically require tests for all PRs in order to:
    1. Prevent the code from being accidentally broken by subsequent changes
    2. Serve as another way to document the expected behavior of the code
    
    If tests are not included in your PR, please explain why (for example,
    are they covered by existing tests)?
    -->
    
    # Are there any user-facing changes?
    
    No
    
    <!--
    If there are user-facing changes then we may require documentation to be
    updated before approving the PR.
    
    If there are any breaking changes to public APIs, please call them out.
    -->
---
 arrow-json/src/reader/binary_array.rs | 152 ++++++++++++++++++++++++++++------
 1 file changed, 128 insertions(+), 24 deletions(-)

diff --git a/arrow-json/src/reader/binary_array.rs 
b/arrow-json/src/reader/binary_array.rs
index a71569d57f..712eb6bb4d 100644
--- a/arrow-json/src/reader/binary_array.rs
+++ b/arrow-json/src/reader/binary_array.rs
@@ -15,30 +15,70 @@
 // specific language governing permissions and limitations
 // under the License.
 
-use arrow_array::builder::{
-    BinaryViewBuilder, FixedSizeBinaryBuilder, GenericBinaryBuilder, 
GenericStringBuilder,
-};
+use arrow_array::builder::{BinaryViewBuilder, FixedSizeBinaryBuilder, 
GenericBinaryBuilder};
 use arrow_array::{Array, GenericStringArray, OffsetSizeTrait};
 use arrow_data::ArrayData;
 use arrow_schema::ArrowError;
+use std::io::Write;
 use std::marker::PhantomData;
 
 use crate::reader::ArrayDecoder;
 use crate::reader::tape::{Tape, TapeElement};
 
-/// Decode a hex-encoded string into bytes
-fn decode_hex_string(hex_string: &str) -> Result<Vec<u8>, ArrowError> {
-    let mut decoded = Vec::with_capacity(hex_string.len() / 2);
-    for substr in hex_string.as_bytes().chunks(2) {
-        let str = std::str::from_utf8(substr).map_err(|e| {
-            ArrowError::JsonError(format!("invalid utf8 in hex encoded binary 
data: {e}"))
-        })?;
-        let byte = u8::from_str_radix(str, 16).map_err(|e| {
-            ArrowError::JsonError(format!("invalid hex encoding in binary 
data: {e}"))
-        })?;
-        decoded.push(byte);
+#[inline]
+fn decode_hex_digit(byte: u8) -> Option<u8> {
+    match byte {
+        b'0'..=b'9' => Some(byte - b'0'),
+        b'a'..=b'f' => Some(byte - b'a' + 10),
+        b'A'..=b'F' => Some(byte - b'A' + 10),
+        _ => None,
     }
-    Ok(decoded)
+}
+
+fn invalid_hex_error_at(index: usize, byte: u8) -> ArrowError {
+    ArrowError::JsonError(format!(
+        "invalid hex encoding in binary data: invalid digit 0x{byte:02x} at 
position {index}"
+    ))
+}
+
+fn decode_hex_to_writer<W: Write>(hex_string: &str, writer: &mut W) -> 
Result<(), ArrowError> {
+    let bytes = hex_string.as_bytes();
+    let mut iter = bytes.chunks_exact(2);
+    let mut buffer = [0u8; 64];
+    let mut buffered = 0;
+
+    for (pair_index, pair) in (&mut iter).enumerate() {
+        let base = pair_index * 2;
+        let high = decode_hex_digit(pair[0]).ok_or_else(|| 
invalid_hex_error_at(base, pair[0]))?;
+        let low =
+            decode_hex_digit(pair[1]).ok_or_else(|| invalid_hex_error_at(base 
+ 1, pair[1]))?;
+        buffer[buffered] = (high << 4) | low;
+        buffered += 1;
+
+        if buffered == buffer.len() {
+            writer
+                .write_all(&buffer)
+                .map_err(|e| ArrowError::JsonError(format!("failed to write 
binary data: {e}")))?;
+            buffered = 0;
+        }
+    }
+
+    let remainder = iter.remainder();
+    if !remainder.is_empty() {
+        let index = (bytes.len() / 2) * 2;
+        let low = decode_hex_digit(remainder[0])
+            .ok_or_else(|| invalid_hex_error_at(index, remainder[0]))?;
+        buffer[buffered] = low;
+        buffered += 1;
+    }
+
+    if buffered > 0 {
+        writer
+            .write_all(&buffer[..buffered])
+            .map_err(|e| ArrowError::JsonError(format!("failed to write binary 
data: {e}")))?;
+    }
+
+    Ok(())
 }
 
 #[derive(Default)]
@@ -59,14 +99,14 @@ impl<O: OffsetSizeTrait> ArrayDecoder for 
BinaryArrayDecoder<O> {
 
         let mut builder = GenericBinaryBuilder::<O>::with_capacity(pos.len(), 
data_capacity);
 
-        GenericStringBuilder::<O>::with_capacity(pos.len(), data_capacity);
-
         for p in pos {
             match tape.get(*p) {
                 TapeElement::String(idx) => {
                     let string = tape.get_string(idx);
-                    let decoded = decode_hex_string(string)?;
-                    builder.append_value(&decoded);
+                    // Decode directly into the builder for performance. If 
decoding fails,
+                    // the error is terminal and the builder is discarded by 
the caller.
+                    decode_hex_to_writer(string, &mut builder)?;
+                    builder.append_value(b"");
                 }
                 TapeElement::Null => builder.append_null(),
                 _ => unreachable!(),
@@ -91,13 +131,17 @@ impl FixedSizeBinaryArrayDecoder {
 impl ArrayDecoder for FixedSizeBinaryArrayDecoder {
     fn decode(&mut self, tape: &Tape<'_>, pos: &[u32]) -> Result<ArrayData, 
ArrowError> {
         let mut builder = FixedSizeBinaryBuilder::with_capacity(pos.len(), 
self.len);
+        // Preallocate for the decoded byte width (FixedSizeBinary len), not 
the hex string length.
+        let mut scratch = Vec::with_capacity(self.len as usize);
 
         for p in pos {
             match tape.get(*p) {
                 TapeElement::String(idx) => {
                     let string = tape.get_string(idx);
-                    let decoded = decode_hex_string(string)?;
-                    builder.append_value(&decoded)?;
+                    scratch.clear();
+                    scratch.reserve(string.len().div_ceil(2));
+                    decode_hex_to_writer(string, &mut scratch)?;
+                    builder.append_value(&scratch)?;
                 }
                 TapeElement::Null => builder.append_null(),
                 _ => unreachable!(),
@@ -115,13 +159,16 @@ impl ArrayDecoder for BinaryViewDecoder {
     fn decode(&mut self, tape: &Tape<'_>, pos: &[u32]) -> Result<ArrayData, 
ArrowError> {
         let data_capacity = estimate_data_capacity(tape, pos)?;
         let mut builder = BinaryViewBuilder::with_capacity(data_capacity);
+        let mut scratch = Vec::new();
 
         for p in pos {
             match tape.get(*p) {
                 TapeElement::String(idx) => {
                     let string = tape.get_string(idx);
-                    let decoded = decode_hex_string(string)?;
-                    builder.append_value(&decoded);
+                    scratch.clear();
+                    scratch.reserve(string.len().div_ceil(2));
+                    decode_hex_to_writer(string, &mut scratch)?;
+                    builder.append_value(&scratch);
                 }
                 TapeElement::Null => builder.append_null(),
                 _ => unreachable!(),
@@ -139,7 +186,7 @@ fn estimate_data_capacity(tape: &Tape<'_>, pos: &[u32]) -> 
Result<usize, ArrowEr
             TapeElement::String(idx) => {
                 let string_len = tape.get_string(idx).len();
                 // two hex characters represent one byte
-                let decoded_len = string_len / 2;
+                let decoded_len = string_len.div_ceil(2);
                 data_capacity += decoded_len;
             }
             TapeElement::Null => {}
@@ -150,3 +197,60 @@ fn estimate_data_capacity(tape: &Tape<'_>, pos: &[u32]) -> 
Result<usize, ArrowEr
     }
     Ok(data_capacity)
 }
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use crate::ReaderBuilder;
+    use arrow_schema::{DataType, Field};
+    use std::io::Cursor;
+
+    #[test]
+    fn test_decode_hex_to_writer_empty() {
+        let mut out = Vec::new();
+        decode_hex_to_writer("", &mut out).unwrap();
+        assert!(out.is_empty());
+    }
+
+    #[test]
+    fn test_decode_hex_to_writer_odd_length() {
+        let mut out = Vec::new();
+        decode_hex_to_writer("0f0", &mut out).unwrap();
+        assert_eq!(out, vec![0x0f, 0x00]);
+
+        out.clear();
+        decode_hex_to_writer("a", &mut out).unwrap();
+        assert_eq!(out, vec![0x0a]);
+    }
+
+    #[test]
+    fn test_decode_hex_to_writer_invalid() {
+        let mut out = Vec::new();
+        let err = decode_hex_to_writer("0f0g", &mut out).unwrap_err();
+        match err {
+            ArrowError::JsonError(msg) => {
+                assert!(msg.contains("invalid hex encoding in binary data"));
+                assert!(msg.contains("position 3"));
+            }
+            _ => panic!("expected JsonError"),
+        }
+    }
+
+    #[test]
+    fn test_binary_reader_invalid_hex_is_terminal() {
+        let field = Field::new("item", DataType::Binary, false);
+        let data = b"\"0f0g\"\n\"0f00\"\n";
+        let mut reader = ReaderBuilder::new_with_field(field)
+            .build(Cursor::new(data))
+            .unwrap();
+
+        let err = reader.next().unwrap().unwrap_err().to_string();
+        assert!(err.contains("invalid hex encoding in binary data"));
+
+        match reader.next() {
+            None => {}
+            Some(Err(_)) => {}
+            Some(Ok(_)) => panic!("expected terminal error after invalid hex"),
+        }
+    }
+}

Reply via email to