zanmato1984 opened a new issue, #43202:
URL: https://github.com/apache/arrow/issues/43202

   ### Describe the bug, including details regarding any error messages, 
version, and platform.
   
   When debugging the test in #43046, I accidentally found that the offsets 
within the row table were overflowing (which is much worse than #43046 itself). 
Here is a complete test that exposes this issue:
   
   ```C++
   // Compare columns to rows of row ids larger than 2^31 within a row table.
   // Certain AVX2 instructions may behave unexpectedly causing troubles like 
GH-43046.
   TEST(KeyCompare, LARGE_MEMORY_TEST(CompareColumnsToRowsMany)) {
     if constexpr (sizeof(void*) == 4) {
       GTEST_SKIP() << "Test only works on 64-bit platforms";
     }
   
     // The idea of this case is to create a row table containing one fixed 
length column and
     // one var length column (so the row is hence var length and has offset 
buffer), by
     // appending the same small batch of n rows repeatedly until it has more 
than 2^31 rows.
     // Then compare the last n rows of the row table with the batch.
     constexpr int64_t num_rows_batch = std::numeric_limits<uint16_t>::max();
     constexpr int64_t num_rows_row_table =
         (std::numeric_limits<int32_t>::max() + 1ll) / num_rows_batch * 
num_rows_batch +
         num_rows_batch;
   
     MemoryPool* pool = default_memory_pool();
   
     // The left side columns with num_rows_batch rows.
     std::vector<KeyColumnArray> columns_left;
     ExecBatch batch_left;
     {
       std::vector<Datum> values;
   
       // A fixed length array containing random values.
       ASSERT_OK_AND_ASSIGN(auto value_fixed_length,
                            
::arrow::gen::Random(uint32())->Generate(num_rows_batch));
       values.push_back(std::move(value_fixed_length));
   
       // A var length array containing small var length values ("X").
       ASSERT_OK_AND_ASSIGN(auto value_var_length,
                            
::arrow::gen::Constant(std::make_shared<BinaryScalar>("X"))
                                ->Generate(num_rows_batch));
       values.push_back(std::move(value_var_length));
   
       batch_left = ExecBatch(std::move(values), num_rows_batch);
       ASSERT_OK(ColumnArraysFromExecBatch(batch_left, &columns_left));
     }
   
     // The right side row table with num_rows_row_table rows.
     RowTableImpl row_table_right;
     {
       // Encode the row table with the left columns repeatedly.
       std::vector<KeyColumnMetadata> column_metadatas;
       ASSERT_OK(ColumnMetadatasFromExecBatch(batch_left, &column_metadatas));
       RowTableMetadata table_metadata;
       table_metadata.FromColumnMetadataVector(column_metadatas, 
sizeof(uint64_t),
                                               sizeof(uint64_t));
       ASSERT_OK(row_table_right.Init(pool, table_metadata));
       RowTableImpl row_table_batch;
       ASSERT_OK(row_table_batch.Init(pool, table_metadata));
       std::vector<uint16_t> row_ids(num_rows_batch);
       std::iota(row_ids.begin(), row_ids.end(), 0);
       RowTableEncoder row_encoder;
       row_encoder.Init(column_metadatas, sizeof(uint64_t), sizeof(uint64_t));
       row_encoder.PrepareEncodeSelected(0, num_rows_batch, columns_left);
       ASSERT_OK(row_encoder.EncodeSelected(
           &row_table_batch, static_cast<uint32_t>(num_rows_batch), 
row_ids.data()));
       for (int i = 0; i < num_rows_row_table / num_rows_batch; ++i) {
         ASSERT_OK(row_table_right.AppendSelectionFrom(row_table_batch, 
num_rows_batch,
                                                       
/*source_row_ids=*/NULLPTR));
       }
   
       // The row table must contain an offset buffer.
       ASSERT_NE(row_table_right.offsets(), NULLPTR);
       ASSERT_EQ(row_table_right.length(), num_rows_row_table);
     }
   
     // The rows to compare: all rows in the batch to the last num_rows_batch 
rows of the
     // row table.
     std::vector<uint32_t> row_ids_to_compare(num_rows_batch);
     std::iota(row_ids_to_compare.begin(), row_ids_to_compare.end(),
               num_rows_row_table - num_rows_batch);
   
     TempVectorStack stack;
     ASSERT_OK(
         stack.Init(pool, 
KeyCompare::CompareColumnsToRowsTempStackUsage(num_rows_batch)));
     LightContext ctx{CpuInfo::GetInstance()->hardware_flags(), &stack};
   
     {
       // No selection, output no match row ids.
       uint32_t num_rows_no_match;
       std::vector<uint16_t> row_ids_out(num_rows_batch);
       KeyCompare::CompareColumnsToRows(num_rows_batch, 
/*sel_left_maybe_null=*/NULLPTR,
                                        row_ids_to_compare.data(), &ctx, 
&num_rows_no_match,
                                        row_ids_out.data(), columns_left, 
row_table_right,
                                        /*are_cols_in_encoding_order=*/true,
                                        
/*out_match_bitvector_maybe_null=*/NULLPTR);
       ASSERT_EQ(num_rows_no_match, 0);
     }
   
     {
       // No selection, output match bit vector.
       std::vector<uint8_t> match_bitvector(BytesForBits(num_rows_batch));
       KeyCompare::CompareColumnsToRows(
           num_rows_batch, /*sel_left_maybe_null=*/NULLPTR, 
row_ids_to_compare.data(), &ctx,
           /*out_num_rows=*/NULLPTR, /*out_sel_left_maybe_same=*/NULLPTR, 
columns_left,
           row_table_right,
           /*are_cols_in_encoding_order=*/true, match_bitvector.data());
       ASSERT_EQ(arrow::internal::CountSetBits(match_bitvector.data(), 0, 
num_rows_batch),
                 num_rows_batch);
     }
   
     std::vector<uint16_t> selection_left(num_rows_batch);
     std::iota(selection_left.begin(), selection_left.end(), 0);
   
     {
       // With selection, output no match row ids.
       uint32_t num_rows_no_match;
       std::vector<uint16_t> row_ids_out(num_rows_batch);
       KeyCompare::CompareColumnsToRows(num_rows_batch, selection_left.data(),
                                        row_ids_to_compare.data(), &ctx, 
&num_rows_no_match,
                                        row_ids_out.data(), columns_left, 
row_table_right,
                                        /*are_cols_in_encoding_order=*/true,
                                        
/*out_match_bitvector_maybe_null=*/NULLPTR);
       ASSERT_EQ(num_rows_no_match, 0);
     }
   
     {
       // With selection, output match bit vector.
       std::vector<uint8_t> match_bitvector(BytesForBits(num_rows_batch));
       KeyCompare::CompareColumnsToRows(
           num_rows_batch, selection_left.data(), row_ids_to_compare.data(), 
&ctx,
           /*out_num_rows=*/NULLPTR, /*out_sel_left_maybe_same=*/NULLPTR, 
columns_left,
           row_table_right,
           /*are_cols_in_encoding_order=*/true, match_bitvector.data());
       ASSERT_EQ(arrow::internal::CountSetBits(match_bitvector.data(), 0, 
num_rows_batch),
                 num_rows_batch);
     }
   }
   ```
   
   ### Component(s)
   
   C++


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: issues-unsubscr...@arrow.apache.org.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org

Reply via email to