This is an automated email from the ASF dual-hosted git repository.
kou pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/main by this push:
new e4cdd003d7 GH-46938: [C++] Enhance arrow::ChunkedArray::Equals to
support floating-point comparison when values share the same memory (#47044)
e4cdd003d7 is described below
commit e4cdd003d789afbaa33c006ea665dbf79ff708bd
Author: Arash Andishgar <[email protected]>
AuthorDate: Thu Jul 10 03:54:37 2025 +0330
GH-46938: [C++] Enhance arrow::ChunkedArray::Equals to support
floating-point comparison when values share the same memory (#47044)
### Rationale for this change
As discussed
[here](https://github.com/apache/arrow/issues/46938#issue-3187249840), this is
a minor enhancement to `arrow::ChunkedArray::Equals`.
### What changes are included in this PR?
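A minor improvement to `arrow::ChunkedArray::Equals` to handle the case
where a chunked array is compared with itself (both operands are the same
object and therefore share the same underlying memory). In that case
`Equals` now returns true immediately when `EqualOptions::nans_equal` is set,
or when the type contains no floating-point type (including nested fields).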
### Are these changes tested?
Yes, I updated and ran the relevant unit tests in
`cpp/src/arrow/chunked_array_test.cc`.
### Are there any user-facing changes?
No.
* GitHub Issue: #46938
Authored-by: Arash Andishgar <[email protected]>
Signed-off-by: Sutou Kouhei <[email protected]>
---
cpp/src/arrow/chunked_array.cc | 19 ++++++++----
cpp/src/arrow/chunked_array_test.cc | 62 ++++++++++++++++++++++---------------
2 files changed, 50 insertions(+), 31 deletions(-)
diff --git a/cpp/src/arrow/chunked_array.cc b/cpp/src/arrow/chunked_array.cc
index 32578ffd93..0fa174c175 100644
--- a/cpp/src/arrow/chunked_array.cc
+++ b/cpp/src/arrow/chunked_array.cc
@@ -100,24 +100,31 @@ DeviceAllocationTypeSet ChunkedArray::device_types()
const {
}
namespace {
-bool mayHaveNaN(const arrow::DataType& type) {
- if (type.num_fields() == 0) {
- return is_floating(type.id());
+// Check whether the type or any of its children is a float type.
+bool ContainsFloatType(const DataType& type) {
+ if (is_floating(type.id())) {
+ return true;
} else {
+ // Check if any nested field contains a float type.
for (const auto& field : type.fields()) {
- if (mayHaveNaN(*field->type())) {
+ if (ContainsFloatType(*field->type())) {
return true;
}
}
}
+ // No float types are observed
return false;
}
} // namespace
bool ChunkedArray::Equals(const ChunkedArray& other, const EqualOptions& opts)
const {
- if (this == &other && !mayHaveNaN(*type_)) {
- return true;
+ if (this == &other) {
+ if (opts.nans_equal()) {
+ return true;
+ } else if (!ContainsFloatType(*type_)) {
+ return true;
+ }
}
if (length_ != other.length()) {
return false;
diff --git a/cpp/src/arrow/chunked_array_test.cc
b/cpp/src/arrow/chunked_array_test.cc
index 689ef57c59..326eb24d08 100644
--- a/cpp/src/arrow/chunked_array_test.cc
+++ b/cpp/src/arrow/chunked_array_test.cc
@@ -153,33 +153,45 @@ TEST_F(TestChunkedArray, EqualsDifferingMetadata) {
ASSERT_TRUE(left.Equals(right));
}
-TEST_F(TestChunkedArray, EqualsSameAddressWithNaNs) {
- auto chunk_with_nan1 = ArrayFromJSON(float64(), "[0, 1, 2, NaN]");
- auto chunk_without_nan1 = ArrayFromJSON(float64(), "[3, 4, 5]");
- ArrayVector chunks1 = {chunk_with_nan1, chunk_without_nan1};
- ASSERT_OK_AND_ASSIGN(auto chunked_array_with_nan1,
ChunkedArray::Make(chunks1));
- ASSERT_FALSE(chunked_array_with_nan1->Equals(chunked_array_with_nan1));
-
- auto chunk_without_nan2 = ArrayFromJSON(float64(), "[6, 7, 8, 9]");
- ArrayVector chunks2 = {chunk_without_nan1, chunk_without_nan2};
- ASSERT_OK_AND_ASSIGN(auto chunked_array_without_nan1,
ChunkedArray::Make(chunks2));
- ASSERT_TRUE(chunked_array_without_nan1->Equals(chunked_array_without_nan1));
+class TestChunkedArrayEqualsSameAddress : public TestChunkedArray {};
+TEST_F(TestChunkedArrayEqualsSameAddress, NonFloatType) {
auto int32_array = ArrayFromJSON(int32(), "[0, 1, 2]");
- auto float64_array_with_nan = ArrayFromJSON(float64(), "[0, 1, NaN]");
- ArrayVector arrays1 = {int32_array, float64_array_with_nan};
- std::vector<std::string> fieldnames = {"Int32Type", "Float64Type"};
- ASSERT_OK_AND_ASSIGN(auto struct_with_nan, StructArray::Make(arrays1,
fieldnames));
- ArrayVector chunks3 = {struct_with_nan};
- ASSERT_OK_AND_ASSIGN(auto chunked_array_with_nan2,
ChunkedArray::Make(chunks3));
- ASSERT_FALSE(chunked_array_with_nan2->Equals(chunked_array_with_nan2));
-
- auto float64_array_without_nan = ArrayFromJSON(float64(), "[0, 1, 2]");
- ArrayVector arrays2 = {int32_array, float64_array_without_nan};
- ASSERT_OK_AND_ASSIGN(auto struct_without_nan, StructArray::Make(arrays2,
fieldnames));
- ArrayVector chunks4 = {struct_without_nan};
- ASSERT_OK_AND_ASSIGN(auto chunked_array_without_nan2,
ChunkedArray::Make(chunks4));
- ASSERT_TRUE(chunked_array_without_nan2->Equals(chunked_array_without_nan2));
+ ASSERT_OK_AND_ASSIGN(auto chunked_array, ChunkedArray::Make({int32_array}));
+ ASSERT_TRUE(chunked_array->Equals(chunked_array));
+}
+
+TEST_F(TestChunkedArrayEqualsSameAddress, NestedTypeWithoutFloat) {
+ auto int32_array = ArrayFromJSON(int32(), "[0, 1]");
+ ASSERT_OK_AND_ASSIGN(auto struct_array,
+ StructArray::Make({int32_array}, {"Int32Type"}));
+ ASSERT_OK_AND_ASSIGN(auto chunked_array, ChunkedArray::Make({struct_array}));
+
+ ASSERT_TRUE(chunked_array->Equals(chunked_array));
+}
+
+TEST_F(TestChunkedArrayEqualsSameAddress, FloatType) {
+ auto float64_array = ArrayFromJSON(float64(), "[0.0, 1.0, 2.0, NaN]");
+ ASSERT_OK_AND_ASSIGN(auto chunked_array,
ChunkedArray::Make({float64_array}));
+
+ ASSERT_FALSE(chunked_array->Equals(chunked_array));
+
+ // Assert when EqualOptions::nans_equal_ is set
+ ASSERT_TRUE(
+ chunked_array->Equals(chunked_array,
EqualOptions::Defaults().nans_equal(true)));
+}
+
+TEST_F(TestChunkedArrayEqualsSameAddress, NestedTypeWithFloat) {
+ auto float64_array = ArrayFromJSON(float64(), "[0.0, 1.0, NaN]");
+ ASSERT_OK_AND_ASSIGN(auto struct_array,
+ StructArray::Make({float64_array}, {"Float64Type"}));
+ ASSERT_OK_AND_ASSIGN(auto chunked_array, ChunkedArray::Make({struct_array}));
+
+ ASSERT_FALSE(chunked_array->Equals(chunked_array));
+
+ // Assert when EqualOptions::nans_equal_ is set
+ ASSERT_TRUE(
+ chunked_array->Equals(chunked_array,
EqualOptions::Defaults().nans_equal(true)));
}
TEST_F(TestChunkedArray, ApproxEquals) {