adamreeve opened a new issue, #47973:
URL: https://github.com/apache/arrow/issues/47973

   ### Describe the bug, including details regarding any error messages, version, and platform.
   
   Repro code, implemented as a unit test in a branch on my fork (https://github.com/adamreeve/arrow/blob/b3016ddf8c52077eef6c5f61f16f234fb2f2cd40/cpp/src/parquet/column_writer_test.cc#L996-L1043):
   
   ```C++
   TEST(TestColumnWriter, ReproInvalidDictIndex) {
     auto sink = CreateOutputStream();
     auto schema = std::static_pointer_cast<GroupNode>(
         GroupNode::Make("schema", Repetition::REQUIRED,
                         {
                             PrimitiveNode::Make("item", Repetition::REQUIRED, Type::INT32),
                         }));
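     // Note: 1 GiB data page size (see the aside at the end about where this came from).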
     auto properties =
         WriterProperties::Builder().data_pagesize(1024 * 1024 * 1024)->build();
     auto file_writer = ParquetFileWriter::Open(sink, schema, properties);
     auto rg_writer = file_writer->AppendRowGroup();
   
     constexpr int32_t num_batches = 150;
     constexpr int32_t batch_size = 1'000'000;
     constexpr int32_t unique_count = 200'000;
   
     std::vector<int32_t> values(batch_size, 0);
   
     auto col_writer = static_cast<parquet::Int32Writer*>(rg_writer->NextColumn());
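     // Write 150 million values in total, cycling through 200,000 distinct values.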
     for (int32_t i = 0; i < num_batches; i++) {
       for (int32_t j = 0; j < batch_size; j++) {
         values[j] = j % unique_count;
       }
       col_writer->WriteBatch(batch_size, nullptr, nullptr, values.data());
     }
     file_writer->Close();
   
     ASSERT_OK_AND_ASSIGN(auto buffer, sink->Finish());
   
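     // Read everything back; in a release build the ReadBatch call below throws
     // "Unexpected end of stream".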
     auto file_reader = ParquetFileReader::Open(
         std::make_shared<::arrow::io::BufferReader>(buffer), default_reader_properties());
     auto metadata = file_reader->metadata();
     ASSERT_EQ(1, metadata->num_row_groups());
     auto row_group_reader = file_reader->RowGroup(0);
     auto col_reader = std::static_pointer_cast<Int32Reader>(row_group_reader->Column(0));
   
     constexpr size_t buffer_size = 1024 * 1024;
     values.resize(buffer_size);
   
     size_t levels_read = 0;
     while (levels_read < num_batches * batch_size) {
       int64_t batch_values;
       int64_t batch_levels = col_reader->ReadBatch(buffer_size, nullptr, nullptr,
                                                    values.data(), &batch_values);
       levels_read += batch_levels;
     }
     std::cout << "Read " << levels_read << " levels" << std::endl;
   }
   ```
   
   In release mode, this fails at `ReadBatch` and outputs:
   ```
   C++ exception with description "Unexpected end of stream" thrown in the test body.
   ```
   
   Reading this file with Polars or DuckDB also fails, with errors about invalid dictionary indices:
   ```
   polars.exceptions.ComputeError: parquet: File out of specification: Dictionary Index is out-of-bounds
   ```
   ```
   _duckdb.Error: Parquet file is likely corrupted, dictionary offset out of range
   ```
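   
   Not required for the repro, but in case it helps triage: a small metadata dump along the lines of the sketch below (my own hypothetical helper, not part of the test; it only uses the existing `parquet::FileMetaData` / `ColumnChunkMetaData` accessors) can be used to check how the column chunk was encoded and how large it ended up:
   
   ```C++
   #include <iostream>
   
   #include "parquet/metadata.h"
   #include "parquet/types.h"
   
   // Hypothetical helper: print value counts, uncompressed sizes and encodings for
   // every column chunk in the file metadata.
   void DumpColumnChunkInfo(const parquet::FileMetaData& metadata) {
     for (int rg = 0; rg < metadata.num_row_groups(); ++rg) {
       auto rg_meta = metadata.RowGroup(rg);
       for (int col = 0; col < rg_meta->num_columns(); ++col) {
         auto cc = rg_meta->ColumnChunk(col);
         std::cout << "row group " << rg << ", column " << col
                   << ": values=" << cc->num_values()
                   << ", uncompressed=" << cc->total_uncompressed_size() << " bytes"
                   << ", encodings=";
         for (auto encoding : cc->encodings()) {
           std::cout << parquet::EncodingToString(encoding) << " ";
         }
         std::cout << std::endl;
       }
     }
   }
   ```
   
   In the test above it could be called as `DumpColumnChunkInfo(*file_reader->metadata());`.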
   
   When run in a debug build, this trips a debug assertion in `RleBitPackedEncoder` here: https://github.com/adamreeve/arrow/blob/f83b301c17b3fbef6d320fcee2355336a163bd1a/cpp/src/arrow/util/rle_encoding_internal.h#L1334
   
   Aside: This large page size wasn't used intentionally in the original code that triggered this problem, but was a side effect of #47027.
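   
   For scale, a rough back-of-envelope (my own arithmetic, not something the writer reports): with 200,000 distinct values each dictionary index needs 18 bits, so 150 million bit-packed indices come to roughly 150,000,000 × 18 / 8 ≈ 340 MB, well under the 1 GiB `data_pagesize`, while the dictionary itself (200,000 × 4 bytes ≈ 800 KB) stays under the default dictionary page size limit. So the writer presumably keeps dictionary encoding and accumulates all 150 million values into a single data page, which seems to be what triggers the failure.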
   
   ### Component(s)
   
   C++, Parquet

