This is an automated email from the ASF dual-hosted git repository.
liaoxin01 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/master by this push:
new 99cb71a59b8 [opt](memory) truncate segment key bounds before storing
segment stats (#63469)
99cb71a59b8 is described below
commit 99cb71a59b8bd6a5e83d19853104dd146275cac9
Author: hui lai <[email protected]>
AuthorDate: Tue May 26 15:41:54 2026 +0800
[opt](memory) truncate segment key bounds before storing segment stats
(#63469)
### What problem does this PR solve?
Problem Summary:
For merge-on-write (MOW) loads, segment key bounds were previously truncated
only when building `RowsetMeta`. Before that point, `BaseBetaRowsetWriter` kept
the full per-segment `min_key` and `max_key` in `SegmentStatistics` inside
`_segid_statistics_map`.
With very long primary keys and highly concurrent stream loads, this
temporary rowset-writer bookkeeping can consume a large amount of peak heap
memory even when `segments_key_bounds_truncation_threshold` is configured.
This change applies the configured key bounds truncation before storing
`SegmentStatistics` in the rowset writer, and preserves the
`segments_key_bounds_truncated` marker for later comparison logic.
### Test
- MOW table, primary key VARCHAR(49000) (approaching the 50000-byte
limit)
- 512 Hash Buckets, monthly Range partitions (2021-01 ~ 2026-12)
- 60 concurrent Stream Loads, targeting 60 TPS, with each batch
approximately 100 MB (uncompressed)
- Run continuously for `FDB_RUN_SECONDS` (default: 24 hours)
Result: memory usage was reduced from 20G to 10G:
before:
<img width="310" height="340" alt="image"
src="https://github.com/user-attachments/assets/d79a6366-1d03-4563-b65a-88b8f2bd9b50"
/>
after:
<img width="358" height="328" alt="image"
src="https://github.com/user-attachments/assets/3779734e-069a-4fd7-bd09-2cb22c0e1b41"
/>
---
be/src/storage/rowset/beta_rowset_writer.cpp | 42 +++++++++++++++++++++++++---
1 file changed, 38 insertions(+), 4 deletions(-)
diff --git a/be/src/storage/rowset/beta_rowset_writer.cpp
b/be/src/storage/rowset/beta_rowset_writer.cpp
index befb9be491c..b9de50944d6 100644
--- a/be/src/storage/rowset/beta_rowset_writer.cpp
+++ b/be/src/storage/rowset/beta_rowset_writer.cpp
@@ -82,6 +82,29 @@ bool is_segment_overlapping(const std::vector<KeyBoundsPB>&
segments_encoded_key
return false;
}
+bool truncate_key_bounds(KeyBoundsPB* key_bounds) {
+ DCHECK(key_bounds != nullptr);
+ if (config::random_segments_key_bounds_truncation) {
+ return false;
+ }
+ const int32_t truncation_threshold =
config::segments_key_bounds_truncation_threshold;
+ if (truncation_threshold <= 0) {
+ return false;
+ }
+ const size_t truncation_size = cast_set<size_t>(truncation_threshold);
+
+ bool truncated = false;
+ if (key_bounds->min_key().size() > truncation_size) {
+ key_bounds->mutable_min_key()->resize(truncation_size);
+ truncated = true;
+ }
+ if (key_bounds->max_key().size() > truncation_size) {
+ key_bounds->mutable_max_key()->resize(truncation_size);
+ truncated = true;
+ }
+ return truncated;
+}
+
void build_rowset_meta_with_spec_field(RowsetMeta& rowset_meta,
const RowsetMeta& spec_rowset_meta) {
rowset_meta.set_num_rows(spec_rowset_meta.num_rows());
@@ -991,6 +1014,7 @@ Status
BaseBetaRowsetWriter::_build_rowset_meta(RowsetMeta* rowset_meta, bool ch
int64_t total_index_size = 0;
std::vector<KeyBoundsPB> segments_encoded_key_bounds;
std::vector<uint32_t> segment_rows;
+ std::optional<bool> segments_key_bounds_truncated;
{
std::lock_guard<std::mutex> lock(_segid_statistics_map_mutex);
for (const auto& itr : _segid_statistics_map) {
@@ -1001,6 +1025,7 @@ Status
BaseBetaRowsetWriter::_build_rowset_meta(RowsetMeta* rowset_meta, bool ch
// segcompaction don't modify _segment_num_rows, so we need to get
segment rows from _segid_statistics_map for load
segment_rows.push_back(cast_set<uint32_t>(itr.second.row_num));
}
+ segments_key_bounds_truncated = _segments_key_bounds_truncated;
}
if (segment_rows.empty()) {
// vertical compaction and linked schema change will not record
segment statistics,
@@ -1011,8 +1036,8 @@ Status
BaseBetaRowsetWriter::_build_rowset_meta(RowsetMeta* rowset_meta, bool ch
for (auto& key_bound : _segments_encoded_key_bounds) {
segments_encoded_key_bounds.push_back(key_bound);
}
- if (_segments_key_bounds_truncated.has_value()) {
-
rowset_meta->set_segments_key_bounds_truncated(_segments_key_bounds_truncated.value());
+ if (segments_key_bounds_truncated.has_value()) {
+
rowset_meta->set_segments_key_bounds_truncated(segments_key_bounds_truncated.value());
}
rowset_meta->set_num_segment_rows(segment_rows);
// segment key bounds are empty in old version(before version 1.2.x). So
we should not modify
@@ -1199,14 +1224,19 @@ Status
BetaRowsetWriter::_check_segment_number_limit(size_t segnum) {
Status BaseBetaRowsetWriter::add_segment(uint32_t segment_id, const
SegmentStatistics& segstat) {
uint32_t segid_offset = segment_id - _segment_start_id;
+ SegmentStatistics stored_segstat = segstat;
+ const bool key_bounds_truncated =
truncate_key_bounds(&stored_segstat.key_bounds);
{
std::lock_guard<std::mutex> lock(_segid_statistics_map_mutex);
CHECK_EQ(_segid_statistics_map.find(segment_id) ==
_segid_statistics_map.end(), true);
- _segid_statistics_map.emplace(segment_id, segstat);
+ _segid_statistics_map.emplace(segment_id, std::move(stored_segstat));
if (segment_id >= _segment_num_rows.size()) {
_segment_num_rows.resize(segment_id + 1);
}
_segment_num_rows[segid_offset] = cast_set<uint32_t>(segstat.row_num);
+ if (key_bounds_truncated) {
+ _segments_key_bounds_truncated = true;
+ }
}
VLOG_DEBUG << "_segid_statistics_map add new record. segment_id:" <<
segment_id
<< " row_num:" << segstat.row_num << " data_size:" <<
segstat.data_size
@@ -1251,10 +1281,14 @@ Status
BetaRowsetWriter::flush_segment_writer_for_segcompaction(
segstat.data_size = segment_size;
segstat.index_size = inverted_index_file_size;
segstat.key_bounds = key_bounds;
+ const bool key_bounds_truncated = truncate_key_bounds(&segstat.key_bounds);
{
std::lock_guard<std::mutex> lock(_segid_statistics_map_mutex);
CHECK_EQ(_segid_statistics_map.find(segid) ==
_segid_statistics_map.end(), true);
- _segid_statistics_map.emplace(segid, segstat);
+ _segid_statistics_map.emplace(segid, std::move(segstat));
+ if (key_bounds_truncated) {
+ _segments_key_bounds_truncated = true;
+ }
}
VLOG_DEBUG << "_segid_statistics_map add new record. segid:" << segid << "
row_num:" << row_num
<< " data_size:" << PrettyPrinter::print_bytes(segment_size)
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]