(arrow) 04/04: GH-47740: [C++][Parquet] Fix undefined behavior when reading invalid Parquet data (#47741)

raulcd Fri, 17 Oct 2025 22:39:52 -0700

This is an automated email from the ASF dual-hosted git repository.

raulcd pushed a commit to branch maint-22.0.0
in repository https://gitbox.apache.org/repos/asf/arrow.git


commit 7b4dd0511162ca985365fbde9e633ca33de57215
Author: Antoine Pitrou <[email protected]>
AuthorDate: Wed Oct 8 08:43:12 2025 +0200

    GH-47740: [C++][Parquet] Fix undefined behavior when reading invalid 
Parquet data (#47741)
    
    ### Rationale for this change
    
    Fix issues found by OSS-Fuzz when invalid Parquet data is fed to the 
Parquet reader:
    * https://issues.oss-fuzz.com/issues/447262173
    * https://issues.oss-fuzz.com/issues/447480433
    * https://issues.oss-fuzz.com/issues/447490896
    * https://issues.oss-fuzz.com/issues/447693724
    * https://issues.oss-fuzz.com/issues/447693728
    * https://issues.oss-fuzz.com/issues/449498800
    
    ### Are these changes tested?
    
    Yes, using the updated fuzz regression files from 
https://github.com/apache/arrow-testing/pull/115
    
    ### Are there any user-facing changes?
    
    No.
    
    **This PR contains a "Critical Fix".** (If the changes fix either (a) a 
security vulnerability, (b) a bug that caused incorrect or invalid data to be 
produced, or (c) a bug that causes a crash (even when the API contract is 
upheld), please provide explanation. If not, you can remove this.)
    
    * GitHub Issue: #47740
    
    Authored-by: Antoine Pitrou <[email protected]>
    Signed-off-by: Antoine Pitrou <[email protected]>
---
 cpp/CMakePresets.json                      |  3 ++-
 cpp/src/arrow/util/rle_encoding_internal.h | 16 ++++++++++------
 cpp/src/parquet/decoder.cc                 |  7 +++++--
 testing                                    |  2 +-
 4 files changed, 18 insertions(+), 10 deletions(-)

diff --git a/cpp/CMakePresets.json b/cpp/CMakePresets.json
index c9e2444389..0c3f85d091 100644
--- a/cpp/CMakePresets.json
+++ b/cpp/CMakePresets.json
@@ -444,7 +444,8 @@
         "CMAKE_CXX_COMPILER": "clang++",
         "ARROW_IPC": "ON",
         "ARROW_PARQUET": "ON",
-        "ARROW_FUZZING": "ON"
+        "ARROW_FUZZING": "ON",
+        "ARROW_WITH_SNAPPY": "ON"
       }
     },
     {
diff --git a/cpp/src/arrow/util/rle_encoding_internal.h 
b/cpp/src/arrow/util/rle_encoding_internal.h
index c231c9a63e..a7917483bb 100644
--- a/cpp/src/arrow/util/rle_encoding_internal.h
+++ b/cpp/src/arrow/util/rle_encoding_internal.h
@@ -657,13 +657,14 @@ auto RleBitPackedParser::PeekImpl(Handler&& handler) const
   const auto header_bytes = bit_util::ParseLeadingLEB128(data_, kMaxSize, 
&run_len_type);
 
   if (ARROW_PREDICT_FALSE(header_bytes == 0)) {
-    // Malfomrmed LEB128 data
+    // Malformed LEB128 data
     return {0, ControlFlow::Break};
   }
 
   const bool is_bit_packed = run_len_type & 1;
   const uint32_t count = run_len_type >> 1;
   if (is_bit_packed) {
+    // Bit-packed run
     constexpr auto kMaxCount = 
bit_util::CeilDiv(internal::max_size_for_v<rle_size_t>, 8);
     if (ARROW_PREDICT_FALSE(count == 0 || count > kMaxCount)) {
       // Illegal number of encoded values
@@ -672,17 +673,21 @@ auto RleBitPackedParser::PeekImpl(Handler&& handler) const
 
     ARROW_DCHECK_LT(static_cast<uint64_t>(count) * 8,
                     internal::max_size_for_v<rle_size_t>);
+    // Count Already divided by 8 for byte size calculations
+    const auto bytes_read = header_bytes + static_cast<int64_t>(count) * 
value_bit_width_;
+    if (ARROW_PREDICT_FALSE(bytes_read > data_size_)) {
+      // Bit-packed run would overflow data buffer
+      return {0, ControlFlow::Break};
+    }
     const auto values_count = static_cast<rle_size_t>(count * 8);
-    // Count Already divided by 8
-    const auto bytes_read =
-        header_bytes + static_cast<rle_size_t>(count) * value_bit_width_;
 
     auto control = handler.OnBitPackedRun(
         BitPackedRun(data_ + header_bytes, values_count, value_bit_width_));
 
-    return {bytes_read, control};
+    return {static_cast<rle_size_t>(bytes_read), control};
   }
 
+  // RLE run
   if (ARROW_PREDICT_FALSE(count == 0)) {
     // Illegal number of encoded values
     return {0, ControlFlow::Break};
@@ -1079,7 +1084,6 @@ auto RleBitPackedDecoder<T>::GetSpaced(Converter 
converter,
   // There may be remaining null if they are not greedily filled by either 
decoder calls
   check_and_handle_fully_null_remaining();
 
-  ARROW_DCHECK(batch.is_done() || exhausted());
   return batch.total_read();
 }
 
diff --git a/cpp/src/parquet/decoder.cc b/cpp/src/parquet/decoder.cc
index 46d1c201e9..b6d7966562 100644
--- a/cpp/src/parquet/decoder.cc
+++ b/cpp/src/parquet/decoder.cc
@@ -2082,9 +2082,12 @@ class DeltaByteArrayDecoderImpl : public 
TypedDecoderImpl<DType> {
                           int64_t valid_bits_offset,
                           typename EncodingTraits<DType>::Accumulator* out,
                           int* out_num_values) {
-    std::vector<ByteArray> values(num_values);
+    std::vector<ByteArray> values(num_values - null_count);
     const int num_valid_values = GetInternal(values.data(), num_values - 
null_count);
-    DCHECK_EQ(num_values - null_count, num_valid_values);
+    if (ARROW_PREDICT_FALSE(num_values - null_count != num_valid_values)) {
+      throw ParquetException("Expected to decode ", num_values - null_count,
+                             " values, but decoded ", num_valid_values, " 
values.");
+    }
 
     auto visit_binary_helper = [&](auto* helper) {
       auto values_ptr = reinterpret_cast<const ByteArray*>(values.data());
diff --git a/testing b/testing
index 6a7b02fac9..abf6d7ebde 160000
--- a/testing
+++ b/testing
@@ -1 +1 @@
-Subproject commit 6a7b02fac93d8addbcdbb213264e58bfdc3068e4
+Subproject commit abf6d7ebde7ab70b541c51859dad2bef71a0151e

(arrow) 04/04: GH-47740: [C++][Parquet] Fix undefined behavior when reading invalid Parquet data (#47741)

Reply via email to