This is an automated email from the ASF dual-hosted git repository.
yiguolei pushed a commit to branch branch-2.1
in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/branch-2.1 by this push:
new fb70742e871 branch-2.1: [Fix](field) Fix potential memory leak and
wrong binary reading about JsonbField (#50174) (#52693)
fb70742e871 is described below
commit fb70742e8713f974bbc5f6b846e712312fcb176c
Author: zclllyybb <[email protected]>
AuthorDate: Thu Jul 3 12:38:37 2025 +0800
branch-2.1: [Fix](field) Fix potential memory leak and wrong binary reading
about JsonbField (#50174) (#52693)
pick https://github.com/apache/doris/pull/50174
---
be/src/runtime/jsonb_value.h | 10 +-
be/src/vec/columns/column_string.h | 2 +
be/src/vec/common/string_buffer.hpp | 3 +
be/src/vec/core/field.cpp | 7 +-
be/src/vec/core/field.h | 62 ++++++------
be/src/vec/data_types/data_type_jsonb.h | 22 ++---
be/src/vec/io/io_helper.h | 12 ++-
be/src/vec/io/var_int.h | 35 +++++--
be/test/vec/core/field_test.cpp | 166 ++++++++++++++++++++++++++++++++
9 files changed, 253 insertions(+), 66 deletions(-)
diff --git a/be/src/runtime/jsonb_value.h b/be/src/runtime/jsonb_value.h
index 1df9469e172..951b91b2f34 100644
--- a/be/src/runtime/jsonb_value.h
+++ b/be/src/runtime/jsonb_value.h
@@ -42,8 +42,8 @@ struct JsonBinaryValue {
size_t len = 0;
JsonbParser parser;
- JsonBinaryValue() : ptr(nullptr), len(0) {}
- JsonBinaryValue(char* ptr, int len) {
+ JsonBinaryValue() = default;
+ JsonBinaryValue(char* ptr, size_t len) {
static_cast<void>(from_json_string(const_cast<const char*>(ptr), len));
}
JsonBinaryValue(const std::string& s) {
@@ -51,11 +51,11 @@ struct JsonBinaryValue {
}
JsonBinaryValue(const char* ptr, int len) {
static_cast<void>(from_json_string(ptr, len)); }
- const char* value() { return ptr; }
+ const char* value() const { return ptr; }
- size_t size() { return len; }
+ size_t size() const { return len; }
- void replace(char* ptr, int len) {
+ void replace(const char* ptr, int len) {
this->ptr = ptr;
this->len = len;
}
diff --git a/be/src/vec/columns/column_string.h
b/be/src/vec/columns/column_string.h
index 3c912043f1c..716b9cb4a16 100644
--- a/be/src/vec/columns/column_string.h
+++ b/be/src/vec/columns/column_string.h
@@ -167,6 +167,8 @@ public:
check_chars_length(new_size, old_size + 1);
chars.resize(new_size);
+ DCHECK(s.data != nullptr);
+ DCHECK(chars.data() != nullptr);
memcpy(chars.data() + old_size, s.data, size_to_append);
offsets.push_back(new_size);
sanity_check_simple();
diff --git a/be/src/vec/common/string_buffer.hpp
b/be/src/vec/common/string_buffer.hpp
index fa759f95e24..f03fb5ce5f2 100644
--- a/be/src/vec/common/string_buffer.hpp
+++ b/be/src/vec/common/string_buffer.hpp
@@ -25,6 +25,8 @@
namespace doris::vectorized {
+// store and commit data. only after commit the data is effective on its'
base(ColumnString)
+// everytime commit, the _data add one row.
class BufferWritable final {
public:
explicit BufferWritable(ColumnString& vector)
@@ -63,6 +65,7 @@ private:
using VectorBufferWriter = BufferWritable;
using BufferWriter = BufferWritable;
+// There is consumption of the buffer in the read method.
class BufferReadable {
public:
explicit BufferReadable(StringRef& ref) : _data(ref.data) {}
diff --git a/be/src/vec/core/field.cpp b/be/src/vec/core/field.cpp
index ac92605007d..d45a1a719ac 100644
--- a/be/src/vec/core/field.cpp
+++ b/be/src/vec/core/field.cpp
@@ -26,14 +26,9 @@
#include "vec/io/io_helper.h"
#include "vec/io/var_int.h"
-namespace doris {
-namespace vectorized {
+namespace doris::vectorized {
class BufferReadable;
class BufferWritable;
-} // namespace vectorized
-} // namespace doris
-
-namespace doris::vectorized {
void read_binary(Array& x, BufferReadable& buf) {
size_t size;
diff --git a/be/src/vec/core/field.h b/be/src/vec/core/field.h
index 922f9abb13e..cceb71e53ee 100644
--- a/be/src/vec/core/field.h
+++ b/be/src/vec/core/field.h
@@ -156,59 +156,61 @@ DEFINE_FIELD_VECTOR(Map);
using VariantMap = std::map<PathInData, Field>;
+//TODO: rethink if we really need this? it only save one pointer from
std::string
+// not POD type so could only use read/write_json_binary instead of
read/write_binary
class JsonbField {
public:
JsonbField() = default;
+ ~JsonbField() = default; // unique_ptr will handle cleanup automatically
- JsonbField(const char* ptr, uint32_t len) : size(len) {
- data = new char[size];
+ JsonbField(const char* ptr, size_t len) : size(len) {
+ data = std::make_unique<char[]>(size);
if (!data) {
LOG(FATAL) << "new data buffer failed, size: " << size;
}
- memcpy(data, ptr, size);
+ if (size > 0) {
+ memcpy(data.get(), ptr, size);
+ }
}
JsonbField(const JsonbField& x) : size(x.size) {
- data = new char[size];
+ data = std::make_unique<char[]>(size);
if (!data) {
LOG(FATAL) << "new data buffer failed, size: " << size;
}
- memcpy(data, x.data, size);
+ if (size > 0) {
+ memcpy(data.get(), x.data.get(), size);
+ }
}
- JsonbField(JsonbField&& x) : data(x.data), size(x.size) {
- x.data = nullptr;
- x.size = 0;
- }
+ JsonbField(JsonbField&& x) noexcept : data(std::move(x.data)),
size(x.size) { x.size = 0; }
+ // dispatch for all type of storage. so need this. but not really used now.
JsonbField& operator=(const JsonbField& x) {
- data = new char[size];
- if (!data) {
- LOG(FATAL) << "new data buffer failed, size: " << size;
+ if (this != &x) {
+ data = std::make_unique<char[]>(x.size);
+ if (!data) {
+ LOG(FATAL) << "new data buffer failed, size: " << x.size;
+ }
+ if (x.size > 0) {
+ memcpy(data.get(), x.data.get(), x.size);
+ }
+ size = x.size;
}
- memcpy(data, x.data, size);
return *this;
}
- JsonbField& operator=(JsonbField&& x) {
- if (data) {
- delete[] data;
+ JsonbField& operator=(JsonbField&& x) noexcept {
+ if (this != &x) {
+ data = std::move(x.data);
+ size = x.size;
+ x.size = 0;
}
- data = x.data;
- size = x.size;
- x.data = nullptr;
- x.size = 0;
return *this;
}
- ~JsonbField() {
- if (data) {
- delete[] data;
- }
- }
-
- const char* get_value() const { return data; }
- uint32_t get_size() const { return size; }
+ const char* get_value() const { return data.get(); }
+ size_t get_size() const { return size; }
bool operator<(const JsonbField& r) const {
LOG(FATAL) << "comparing between JsonbField is not supported";
@@ -246,8 +248,8 @@ public:
}
private:
- char* data = nullptr;
- uint32_t size = 0;
+ std::unique_ptr<char[]> data = nullptr;
+ size_t size = 0;
};
template <typename T>
diff --git a/be/src/vec/data_types/data_type_jsonb.h
b/be/src/vec/data_types/data_type_jsonb.h
index 41b211a09c2..e5b330061f2 100644
--- a/be/src/vec/data_types/data_type_jsonb.h
+++ b/be/src/vec/data_types/data_type_jsonb.h
@@ -18,9 +18,9 @@
#pragma once
#include <gen_cpp/Types_types.h>
-#include <stddef.h>
-#include <stdint.h>
+#include <cstddef>
+#include <cstdint>
#include <memory>
#include <string>
@@ -36,15 +36,11 @@
#include "vec/data_types/serde/data_type_serde.h"
#include "vec/data_types/serde/data_type_string_serde.h"
-namespace doris {
-namespace vectorized {
+namespace doris::vectorized {
class BufferWritable;
class IColumn;
class ReadBuffer;
-} // namespace vectorized
-} // namespace doris
-namespace doris::vectorized {
class DataTypeJsonb final : public IDataType {
public:
using ColumnType = ColumnString;
@@ -67,10 +63,13 @@ public:
MutableColumnPtr create_column() const override;
- virtual Field get_default() const override {
+ Field get_default() const override {
std::string default_json = "{}";
- JsonBinaryValue binary_val(default_json.c_str(), default_json.size());
- return JsonbField(binary_val.value(), binary_val.size());
+ // convert default_json to binary
+ JsonBinaryValue binary_val(default_json.c_str(),
static_cast<Int32>(default_json.size()));
+ // Throw exception if default_json.size() is large than INT32_MAX
+ // JsonbField keeps its own memory
+ return JsonbField(binary_val.value(),
static_cast<UInt32>(binary_val.size()));
}
Field get_field(const TExprNode& node) const override {
@@ -99,4 +98,5 @@ public:
private:
DataTypeString data_type_string;
};
-} // namespace doris::vectorized
\ No newline at end of file
+
+} // namespace doris::vectorized
diff --git a/be/src/vec/io/io_helper.h b/be/src/vec/io/io_helper.h
index d5ca522146a..f3dc3d1b131 100644
--- a/be/src/vec/io/io_helper.h
+++ b/be/src/vec/io/io_helper.h
@@ -30,6 +30,7 @@
#include "vec/common/string_buffer.hpp"
#include "vec/common/string_ref.h"
#include "vec/common/uint128.h"
+#include "vec/core/field.h"
#include "vec/core/types.h"
#include "vec/io/reader_buffer.h"
#include "vec/io/var_int.h"
@@ -126,7 +127,7 @@ inline void write_string_binary(const char* s,
BufferWritable& buf) {
write_string_binary(StringRef {std::string(s)}, buf);
}
-inline void write_json_binary(JsonbField s, BufferWritable& buf) {
+inline void write_json_binary(const JsonbField& s, BufferWritable& buf) {
write_string_binary(StringRef {s.get_value(), s.get_size()}, buf);
}
@@ -200,13 +201,14 @@ inline StringRef read_string_binary_into(Arena& arena,
BufferReadable& buf) {
char* data = arena.alloc(size);
buf.read(data, size);
- return StringRef(data, size);
+ return {data, size};
}
-inline void read_json_binary(JsonbField val, BufferReadable& buf,
+inline void read_json_binary(JsonbField& val, BufferReadable& buf,
size_t MAX_JSON_SIZE = DEFAULT_MAX_JSON_SIZE) {
- StringRef jrf = StringRef {val.get_value(), val.get_size()};
- read_string_binary(jrf, buf);
+ StringRef result;
+ read_string_binary(result, buf);
+ val = JsonbField(result.data, result.size);
}
template <typename Type>
diff --git a/be/src/vec/io/var_int.h b/be/src/vec/io/var_int.h
index eda4b6b3873..eb06a1ccb1d 100644
--- a/be/src/vec/io/var_int.h
+++ b/be/src/vec/io/var_int.h
@@ -93,35 +93,44 @@ inline void read_var_uint(UInt64& x, std::istream& istr) {
for (size_t i = 0; i < 9; ++i) {
UInt64 byte = istr.get();
x |= (byte & 0x7F) << (7 * i);
- if (!(byte & 0x80)) return;
+ if (!(byte & 0x80)) {
+ return;
+ }
}
}
inline void write_var_uint(UInt64 x, std::ostream& ostr) {
for (size_t i = 0; i < 9; ++i) {
uint8_t byte = x & 0x7F;
- if (x > 0x7F) byte |= 0x80;
+ if (x > 0x7F) {
+ byte |= 0x80;
+ }
ostr.put(byte);
x >>= 7;
- if (!x) return;
+ if (!x) {
+ return;
+ }
}
}
// TODO: do real implement in the future
inline void read_var_uint(UInt64& x, BufferReadable& buf) {
x = 0;
+ // get length from first byte firstly
uint8_t len = 0;
buf.read((char*)&len, 1);
auto ref = buf.read(len);
-
+ // read data and set it to x per byte.
char* bytes = const_cast<char*>(ref.data);
for (size_t i = 0; i < 9; ++i) {
UInt64 byte = bytes[i];
x |= (byte & 0x7F) << (7 * i);
- if (!(byte & 0x80)) return;
+ if (!(byte & 0x80)) {
+ return;
+ }
}
}
@@ -130,12 +139,16 @@ inline void write_var_uint(UInt64 x, BufferWritable&
ostr) {
uint8_t i = 0;
while (i < 9) {
uint8_t byte = x & 0x7F;
- if (x > 0x7F) byte |= 0x80;
+ if (x > 0x7F) {
+ byte |= 0x80;
+ }
bytes[i++] = byte;
x >>= 7;
- if (!x) break;
+ if (!x) {
+ break;
+ }
}
ostr.write((char*)&i, 1);
ostr.write(bytes, i);
@@ -144,13 +157,17 @@ inline void write_var_uint(UInt64 x, BufferWritable&
ostr) {
inline char* write_var_uint(UInt64 x, char* ostr) {
for (size_t i = 0; i < 9; ++i) {
uint8_t byte = x & 0x7F;
- if (x > 0x7F) byte |= 0x80;
+ if (x > 0x7F) {
+ byte |= 0x80;
+ }
*ostr = byte;
++ostr;
x >>= 7;
- if (!x) return ostr;
+ if (!x) {
+ return ostr;
+ }
}
return ostr;
diff --git a/be/test/vec/core/field_test.cpp b/be/test/vec/core/field_test.cpp
new file mode 100644
index 00000000000..d1b393bf7cd
--- /dev/null
+++ b/be/test/vec/core/field_test.cpp
@@ -0,0 +1,166 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "vec/core/field.h"
+
+#include <gtest/gtest-message.h>
+#include <gtest/gtest-test-part.h>
+
+#include <string>
+
+#include "gtest/gtest_pred_impl.h" // IWYU pragma: keep
+#include "runtime/define_primitive_type.h"
+#include "vec/columns/column_string.h"
+#include "vec/common/string_buffer.hpp"
+#include "vec/common/string_ref.h"
+#include "vec/core/types.h"
+#include "vec/io/io_helper.h"
+
+namespace doris::vectorized {
+
+TEST(VFieldTest, jsonb_field_unique_ptr) {
+ // Test default constructor
+ JsonbField empty;
+ ASSERT_EQ(empty.get_value(), nullptr);
+ ASSERT_EQ(empty.get_size(), 0);
+
+ // Test constructor with data
+ const char* test_data = R"({ "key": "value" })";
+ size_t test_size = strlen(test_data);
+ JsonbField jf1(test_data, test_size);
+ ASSERT_NE(jf1.get_value(), nullptr);
+ ASSERT_EQ(jf1.get_size(), test_size);
+ ASSERT_EQ(std::string(jf1.get_value(), jf1.get_size()),
std::string(test_data));
+
+ // Test copy constructor
+ JsonbField jf2(jf1);
+ ASSERT_NE(jf2.get_value(), nullptr);
+ ASSERT_NE(jf2.get_value(), jf1.get_value()); // Different memory locations
+ ASSERT_EQ(jf2.get_size(), jf1.get_size());
+ ASSERT_EQ(std::string(jf2.get_value(), jf2.get_size()),
+ std::string(jf1.get_value(), jf1.get_size()));
+
+ // Test move constructor
+ JsonbField jf3(std::move(jf2));
+ ASSERT_NE(jf3.get_value(), nullptr);
+ ASSERT_EQ(jf2.get_value(), nullptr); // jf2 should be empty after move
+ ASSERT_EQ(jf2.get_size(), 0); // jf2 size should be 0 after move
+ ASSERT_EQ(jf3.get_size(), test_size);
+ ASSERT_EQ(std::string(jf3.get_value(), jf3.get_size()),
std::string(test_data));
+
+ // Test copy assignment
+ JsonbField jf4;
+ jf4 = jf1;
+ ASSERT_NE(jf4.get_value(), nullptr);
+ ASSERT_NE(jf4.get_value(), jf1.get_value()); // Different memory locations
+ ASSERT_EQ(jf4.get_size(), jf1.get_size());
+ ASSERT_EQ(std::string(jf4.get_value(), jf4.get_size()),
+ std::string(jf1.get_value(), jf1.get_size()));
+
+ // Test move assignment
+ JsonbField jf5;
+ jf5 = std::move(jf4);
+ ASSERT_NE(jf5.get_value(), nullptr);
+ ASSERT_EQ(jf4.get_value(), nullptr); // jf4 should be empty after move
+ ASSERT_EQ(jf4.get_size(), 0); // jf4 size should be 0 after move
+ ASSERT_EQ(jf5.get_size(), test_size);
+ ASSERT_EQ(std::string(jf5.get_value(), jf5.get_size()),
std::string(test_data));
+
+ // Test JsonbField with Field
+ Field field_jf = jf1;
+ ASSERT_EQ(field_jf.get_type(), Field::Types::JSONB);
+ ASSERT_NE(field_jf.get<JsonbField>().get_value(), nullptr);
+ ASSERT_EQ(field_jf.get<JsonbField>().get_size(), test_size);
+ ASSERT_EQ(std::string(field_jf.get<JsonbField>().get_value(),
+ field_jf.get<JsonbField>().get_size()),
+ std::string(test_data));
+}
+
+// Test for JsonbField I/O operations
+TEST(VFieldTest, jsonb_field_io) {
+ // Prepare a JsonbField
+ const char* test_data = R"({ "key": "value" })";
+ size_t test_size = strlen(test_data);
+ JsonbField original(test_data, test_size);
+
+ // TEST 1: write_json_binary - From JsonbField to buffer
+ // Create a ColumnString to use with BufferWritable
+ ColumnString column_str;
+
+ // Write the JsonbField to the buffer
+ {
+ BufferWritable buf(column_str);
+ write_json_binary(original, buf);
+ buf.commit(); // Important: commit the write operation
+ }
+
+ // Verify data was written
+ ASSERT_GT(column_str.size(), 0);
+
+ // Read the JsonbField back using BufferReadable
+ {
+ // Get the StringRef from ColumnString
+ StringRef str_ref = column_str.get_data_at(0);
+
+ // Create a BufferReadable from StringRef
+ BufferReadable read_buf(str_ref);
+
+ // Read the data back into a new JsonbField
+ JsonbField read_field;
+ read_json_binary(read_field, read_buf);
+
+ // Verify the data
+ ASSERT_NE(read_field.get_value(), nullptr);
+ ASSERT_EQ(read_field.get_size(), original.get_size());
+ ASSERT_EQ(std::string(read_field.get_value(), read_field.get_size()),
+ std::string(original.get_value(), original.get_size()));
+ }
+
+ // Test with JsonbField as a Field and serde it
+ {
+ ColumnString field_column;
+
+ // ser
+ {
+ BufferWritable field_buf(field_column);
+ write_json_binary(original, field_buf);
+ field_buf.commit();
+ }
+
+ // Verify field was written
+ ASSERT_GT(field_column.size(), 0);
+
+ // de
+ {
+ StringRef field_str_ref = field_column.get_data_at(0);
+ BufferReadable read_field_buf(field_str_ref);
+
+ // we can't use read_binary because of the JsonbField is not POD
type
+ JsonbField jsonb_from_field;
+ read_json_binary(jsonb_from_field, read_field_buf);
+ Field f2 = jsonb_from_field;
+
+ ASSERT_EQ(f2.get_type(), Field::Types::JSONB);
+ ASSERT_NE(f2.get<JsonbField>().get_value(), nullptr);
+ ASSERT_EQ(
+ std::string(f2.get<JsonbField>().get_value(),
f2.get<JsonbField>().get_size()),
+ std::string(test_data));
+ }
+ }
+}
+
+} // namespace doris::vectorized
\ No newline at end of file
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]