waruto210 opened a new issue, #48072:
URL: https://github.com/apache/arrow/issues/48072
### Describe the bug, including details regarding any error messages,
version, and platform.
branch: `main`
commit sha: `fcfb33d0b6f7d368bcd00bb661d35c71885c5140`
```c++
/// file: cpp/src/arrow/acero/unmaterialized_table_internal.h:175
template <class Type, class Builder = typename
TypeTraits<Type>::BuilderType>
enable_if_boolean<Type, Status> static BuilderAppend(
Builder& builder, const std::shared_ptr<ArrayData>& source, uint64_t
row) {
if (source->IsNull(row)) {
builder.UnsafeAppendNull();
return Status::OK();
}
builder.UnsafeAppend(bit_util::GetBit(source->template
GetValues<uint8_t>(1), row));
return Status::OK();
}
/// cpp/src/arrow/array/data.h:271
template <typename T>
inline const T* GetValues(int i) const {
return GetValues<T>(i, offset);
}
```
Here, uint8_t type is used for GetValues, and the underlying buffer applies
a logical offset. However, since the boolean type is actually stored bitwise,
this may lead to a segmentation fault or incorrect sorting results.
I added the following unit test, which can verify that the result of
sortd_merge is incorrect.
```c++
TEST(SortedMergeNode, TestSortedMergeTwoInputsWithBool) {
const int64_t row_count = (16 << 10); // 16k rows per input
// Create schema with int column A and bool column B
auto test_schema = arrow::schema(
{arrow::field("col_a", arrow::int32()), arrow::field("col_b",
arrow::boolean())});
// Helper lambda to create table with specific pattern
auto create_test_scanner = [&](int64_t cnt, int offset) ->
arrow::Result<Declaration> {
// Create column A (int) - values from offset to offset+cnt-1
arrow::Int32Builder col_a_builder;
std::vector<int32_t> col_a_values;
col_a_values.reserve(cnt);
for (int64_t i = 0; i < cnt; ++i) {
col_a_values.push_back(static_cast<int32_t>(offset + i));
}
ARROW_RETURN_NOT_OK(col_a_builder.AppendValues(col_a_values));
std::shared_ptr<arrow::Array> col_a_arr;
ARROW_RETURN_NOT_OK(col_a_builder.Finish(&col_a_arr));
// Create column B (bool) - pattern: true if col_a % 5 == 0, false
otherwise
arrow::BooleanBuilder col_b_builder;
for (int64_t i = 0; i < cnt; ++i) {
int32_t a_value = offset + i;
bool b_value = (a_value % 5 == 0);
ARROW_RETURN_NOT_OK(col_b_builder.Append(b_value));
}
std::shared_ptr<arrow::Array> col_b_arr;
ARROW_RETURN_NOT_OK(col_b_builder.Finish(&col_b_arr));
auto table = arrow::Table::Make(test_schema, {col_a_arr, col_b_arr});
auto table_source =
Declaration("table_source", TableSourceNodeOptions(table, row_count
/ 16));
return table_source;
};
ASSERT_OK_AND_ASSIGN(auto source1, create_test_scanner(row_count, 0));
ASSERT_OK_AND_ASSIGN(auto source2, create_test_scanner(row_count, 8192));
// Create sorted merge by column A
auto ops =
OrderByNodeOptions(compute::Ordering({compute::SortKey("col_a")}));
Declaration sorted_merge{"sorted_merge", {source1, source2}, ops};
// Execute plan and collect result
ASSERT_OK_AND_ASSIGN(auto result_table,
arrow::acero::DeclarationToTable(sorted_merge,
false));
ASSERT_TRUE(result_table != nullptr);
// Verify results
auto col_a = result_table->GetColumnByName("col_a");
auto col_b = result_table->GetColumnByName("col_b");
ASSERT_TRUE(col_a != nullptr);
ASSERT_TRUE(col_b != nullptr);
// Verify sorting and bool values
int32_t last_a_value = std::numeric_limits<int32_t>::min();
int64_t total_rows_checked = 0;
int64_t true_count = 0;
int64_t false_count = 0;
for (int i = 0; i < col_a->num_chunks(); i++) {
auto a_chunk =
std::static_pointer_cast<arrow::Int32Array>(col_a->chunk(i));
auto b_chunk =
std::static_pointer_cast<arrow::BooleanArray>(col_b->chunk(i));
ASSERT_EQ(a_chunk->length(), b_chunk->length())
<< "Column A and B must have same length in chunk " << i;
for (int64_t j = 0; j < a_chunk->length(); j++) {
ASSERT_FALSE(a_chunk->IsNull(j)) << "Column A should not have null
values";
ASSERT_FALSE(b_chunk->IsNull(j)) << "Column B should not have null
values";
int32_t a_value = a_chunk->Value(j);
bool b_value = b_chunk->Value(j);
// Verify sorting by column A
ASSERT_GE(a_value, last_a_value)
<< "Values not sorted at chunk " << i << ", row " << j
<< ": current=" << a_value << ", last=" << last_a_value;
last_a_value = a_value;
// Verify bool value correctness: should be true if a_value % 3 == 0
bool expected_b_value = (a_value % 5 == 0);
ASSERT_EQ(b_value, expected_b_value)
<< "Bool value incorrect at chunk " << i << ", row " << j
<< ": col_a=" << a_value << ", col_b=" << b_value
<< ", expected=" << expected_b_value;
if (b_value) {
true_count++;
} else {
false_count++;
}
total_rows_checked++;
}
}
ASSERT_EQ(last_a_value, 24575);
ASSERT_EQ(total_rows_checked, row_count * 2)
<< "Expected " << row_count << " unique rows after merge";
}
```
### Component(s)
C++
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]