This is an automated email from the ASF dual-hosted git repository.
kou pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/main by this push:
new 607be64e81 GH-46939: [C++] Add support for shared memory comparison in
arrow::RecordBatch (#47149)
607be64e81 is described below
commit 607be64e81a1788d1d7a679b957481f8734e3ee5
Author: Arash Andishgar <[email protected]>
AuthorDate: Tue Jul 22 15:12:16 2025 +0330
GH-46939: [C++] Add support for shared memory comparison in
arrow::RecordBatch (#47149)
### Rationale for this change
Create a fast path for comparing `arrow::RecordBatch` instances that share
the same memory.
### What changes are included in this PR?
Enable fast comparison for `arrow::RecordBatch` objects backed by the same
memory.
### Are these changes tested?
Yes, I ran the relevant unit tests.
### Are there any user-facing changes?
No.
* GitHub Issue: #46939
Authored-by: Arash Andishgar <[email protected]>
Signed-off-by: Sutou Kouhei <[email protected]>
---
cpp/src/arrow/record_batch.cc | 69 ++++++++++++++++++++++++------
cpp/src/arrow/record_batch_test.cc | 88 ++++++++++++++++++++++++++++++++++++++
2 files changed, 144 insertions(+), 13 deletions(-)
diff --git a/cpp/src/arrow/record_batch.cc b/cpp/src/arrow/record_batch.cc
index 04d6890d39..941fd3f002 100644
--- a/cpp/src/arrow/record_batch.cc
+++ b/cpp/src/arrow/record_batch.cc
@@ -310,19 +310,58 @@ const std::string& RecordBatch::column_name(int i) const {
return schema_->field(i)->name();
}
-bool RecordBatch::Equals(const RecordBatch& other, bool check_metadata,
- const EqualOptions& opts) const {
- if (num_columns() != other.num_columns() || num_rows_ != other.num_rows()) {
- return false;
+namespace {
+
+bool ContainFloatType(const std::shared_ptr<DataType>& type) {  // Returns true if `type` itself, or any type nested under it via fields(), is floating-point.
+ if (is_floating(type->id())) {  // the type at this level is a floating-point type
+ return true;
}
- if (!schema_->Equals(*other.schema(), check_metadata)) {
- return false;
+ for (const auto& field : type->fields()) {  // NOTE(review): only children exposed by fields() are visited; confirm this covers every type that can carry floats (e.g. dictionary value types)
+ if (ContainFloatType(field->type())) {  // recurse so floats nested several levels deep are found
+ return true;
+ }
}
- if (device_type() != other.device_type()) {
+ return false;  // neither this type nor any visited child is floating-point
+}
+
+bool ContainFloatType(const Schema& schema) {  // Schema overload: returns true if any field's type contains a floating-point type.
+ for (auto& field : schema.fields()) {
+ if (ContainFloatType(field->type())) {  // delegates to the DataType overload, which also walks nested fields
+ return true;
+ }
+ }
+ return false;  // no field of the schema reported a floating-point type
+}
+
+bool CanIgnoreNaNInEquality(const RecordBatch& batch, const EqualOptions&
opts) {  // Returns true when NaN handling cannot affect the result of comparing `batch` under `opts`.
+ if (opts.nans_equal()) {  // NaNs are treated as equal, so they cannot make a comparison fail
+ return true;
+ } else if (!ContainFloatType(*batch.schema())) {  // no floating-point type found in the schema, so no NaN is expected
+ return true;
+ } else {
return false;  // float data may hold NaN, which does not compare equal to itself under these options
}
+}
+
+} // namespace
+
+bool RecordBatch::Equals(const RecordBatch& other, bool check_metadata,
+ const EqualOptions& opts) const {
+ if (this == &other) {
+ if (CanIgnoreNaNInEquality(*this, opts)) {
+ return true;
+ }
+ } else {
+ if (num_columns() != other.num_columns() || num_rows_ != other.num_rows())
{
+ return false;
+ } else if (!schema_->Equals(*other.schema(), check_metadata)) {
+ return false;
+ } else if (device_type() != other.device_type()) {
+ return false;
+ }
+ }
for (int i = 0; i < num_columns(); ++i) {
if (!column(i)->Equals(other.column(i), opts)) {
@@ -334,12 +373,16 @@ bool RecordBatch::Equals(const RecordBatch& other, bool
check_metadata,
}
bool RecordBatch::ApproxEquals(const RecordBatch& other, const EqualOptions&
opts) const {
- if (num_columns() != other.num_columns() || num_rows_ != other.num_rows()) {
- return false;
- }
-
- if (device_type() != other.device_type()) {
- return false;
+ if (this == &other) {
+ if (CanIgnoreNaNInEquality(*this, opts)) {
+ return true;
+ }
+ } else {
+ if (num_columns() != other.num_columns() || num_rows_ != other.num_rows())
{
+ return false;
+ } else if (device_type() != other.device_type()) {
+ return false;
+ }
}
for (int i = 0; i < num_columns(); ++i) {
diff --git a/cpp/src/arrow/record_batch_test.cc
b/cpp/src/arrow/record_batch_test.cc
index fab8137171..156d083828 100644
--- a/cpp/src/arrow/record_batch_test.cc
+++ b/cpp/src/arrow/record_batch_test.cc
@@ -142,6 +142,94 @@ TEST_F(TestRecordBatch, ApproxEqualOptions) {
EXPECT_TRUE(b1->ApproxEquals(*b2, options));
}
+class TestRecordBatchEqualsSameAddress : public TestRecordBatch {};  // Fixture for tests that compare a RecordBatch against itself (same object).
+
+TEST_F(TestRecordBatchEqualsSameAddress, NonFloatType) {  // Integer-only batch compared with itself must be equal under every option tested.
+ auto f0 = field("f0", int32());
+ auto f1 = field("f1", int64());
+
+ auto schema = ::arrow::schema({f0, f1});
+
+ auto a0 = ArrayFromJSON(f0->type(), "[0, 1, 2]");
+ auto a1 = ArrayFromJSON(f1->type(), "[0, 1, 2]");
+
+ auto b0 = RecordBatch::Make(schema, 3, {a0, a1});
+ auto b1 = b0;  // copy of b0 (presumably a shared_ptr), so *b0 and *b1 are expected to be the same object
+
+ auto options = EqualOptions::Defaults();
+
+ ASSERT_TRUE(b0->Equals(*b1, true, options));  // second argument is check_metadata
+ ASSERT_TRUE(b0->Equals(*b1, true, options.nans_equal(true)));  // nans_equal should make no difference without float columns
+
+ ASSERT_TRUE(b0->ApproxEquals(*b1, options));
+ ASSERT_TRUE(b0->ApproxEquals(*b1, options.nans_equal(true)));
+}
+
+TEST_F(TestRecordBatchEqualsSameAddress, NestedTypesWithoutFloatType) {  // Struct column with integer members only: self-comparison must be equal under every option tested.
+ auto f0 = field("f0", int32());
+ auto f1 = field("f1", struct_({{"f2", int64()}, {"f3", int8()}}));  // nested type whose children are all integers
+
+ auto schema = ::arrow::schema({f0, f1});
+
+ auto a0 = ArrayFromJSON(f0->type(), "[0, 1, 2]");
+ auto a1 = ArrayFromJSON(
+ f1->type(), R"([{"f2": 1, "f3": 4}, {"f2": 2, "f3": 5}, {"f2":3, "f3":
6}])");
+
+ auto b0 = RecordBatch::Make(schema, 3, {a0, a1});
+ auto b1 = b0;  // copy of b0 (presumably a shared_ptr), so *b0 and *b1 are expected to be the same object
+
+ auto options = EqualOptions::Defaults();
+
+ ASSERT_TRUE(b0->Equals(*b1, true, options));  // second argument is check_metadata
+ ASSERT_TRUE(b0->Equals(*b1, true, options.nans_equal(true)));
+
+ ASSERT_TRUE(b0->ApproxEquals(*b1, options));
+ ASSERT_TRUE(b0->ApproxEquals(*b1, options.nans_equal(true)));
+}
+
+TEST_F(TestRecordBatchEqualsSameAddress, FloatType) {  // Float column holding NaN: self-comparison is equal only when NaNs are treated as equal.
+ auto f0 = field("f0", int32());
+ auto f1 = field("f1", float64());
+
+ auto schema = ::arrow::schema({f0, f1});
+
+ auto a0 = ArrayFromJSON(f0->type(), "[0, 1, 2]");
+ auto a1 = ArrayFromJSON(f1->type(), "[0.0, 1.0, NaN]");  // three values, matching a0 and the declared row count; NaN lies inside the batch
+
+ auto b0 = RecordBatch::Make(schema, 3, {a0, a1});
+ auto b1 = b0;  // copy of b0 (presumably a shared_ptr), so *b0 and *b1 are expected to be the same object
+
+ auto options = EqualOptions::Defaults();
+
+ ASSERT_FALSE(b0->Equals(*b1, true, options));  // with default options the NaN makes the batch unequal to itself
+ ASSERT_TRUE(b0->Equals(*b1, true, options.nans_equal(true)));
+
+ ASSERT_FALSE(b0->ApproxEquals(*b1, options));
+ ASSERT_TRUE(b0->ApproxEquals(*b1, options.nans_equal(true)));
+}
+
+TEST_F(TestRecordBatchEqualsSameAddress, NestedTypesWithFloatType) {  // Struct column with a float member holding NaN: self-comparison is equal only when NaNs are treated as equal.
+ auto f0 = field("f0", int32());
+ auto f1 = field("f1", struct_({{"f2", int64()}, {"f3", float32()}}));  // the float type is only reachable through the struct's child field
+
+ auto schema = ::arrow::schema({f0, f1});
+
+ auto a0 = ArrayFromJSON(f0->type(), "[0, 1, 2]");
+ auto a1 = ArrayFromJSON(
+ f1->type(), R"([{"f2": 1, "f3": 4.0}, {"f2": 2, "f3": 4.0}, {"f2":3,
"f3": NaN}])");
+
+ auto b0 = RecordBatch::Make(schema, 3, {a0, a1});
+ auto b1 = b0;  // copy of b0 (presumably a shared_ptr), so *b0 and *b1 are expected to be the same object
+
+ auto options = EqualOptions::Defaults();
+
+ ASSERT_FALSE(b0->Equals(*b1, true, options));  // with default options the nested NaN makes the batch unequal to itself
+ ASSERT_TRUE(b0->Equals(*b1, true, options.nans_equal(true)));
+
+ ASSERT_FALSE(b0->ApproxEquals(*b1, options));
+ ASSERT_TRUE(b0->ApproxEquals(*b1, options.nans_equal(true)));
+}
+
TEST_F(TestRecordBatch, Validate) {
const int length = 10;