This is an automated email from the ASF dual-hosted git repository.

liaoxin01 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/doris.git


The following commit(s) were added to refs/heads/master by this push:
     new 99cb71a59b8 [opt](memory) truncate segment key bounds before storing segment stats (#63469)
99cb71a59b8 is described below

commit 99cb71a59b8bd6a5e83d19853104dd146275cac9
Author: hui lai <[email protected]>
AuthorDate: Tue May 26 15:41:54 2026 +0800

    [opt](memory) truncate segment key bounds before storing segment stats (#63469)
    
    ### What problem does this PR solve?
    
    Problem Summary:
    
    For MOW load, segment key bounds were previously truncated only when
    building `RowsetMeta`. Before that point, `BaseBetaRowsetWriter` kept
    full per-segment `min_key` and `max_key` in `SegmentStatistics` inside
    `_segid_statistics_map`.
    
    With very long primary keys and high-concurrency stream load, this
    temporary rowset writer bookkeeping can consume large peak heap memory
    even when `segments_key_bounds_truncation_threshold` is configured.
    
    This change applies the configured key bounds truncation before storing
    `SegmentStatistics` in the rowset writer, and preserves the
    `segments_key_bounds_truncated` marker for later comparison logic.
    
    ### Test
    
    - MOW table, primary key VARCHAR(49000) (approaching the 50000-byte
    limit)
    - 512 Hash Buckets, monthly Range partitions (2021-01 ~ 2026-12)
    - 60 concurrent Stream Loads, targeting 60 TPS, with each batch
    approximately 100 MB (uncompressed)
    - Continuously running for FDB_RUN_SECONDS (default 24 hours)
    
    This change reduces memory usage in the test above from 20G to 10G:
    
    before:
    <img width="310" height="340" alt="image"
    src="https://github.com/user-attachments/assets/d79a6366-1d03-4563-b65a-88b8f2bd9b50"
    />
    
    after:
    <img width="358" height="328" alt="image"
    src="https://github.com/user-attachments/assets/3779734e-069a-4fd7-bd09-2cb22c0e1b41"
    />
---
 be/src/storage/rowset/beta_rowset_writer.cpp | 42 +++++++++++++++++++++++++---
 1 file changed, 38 insertions(+), 4 deletions(-)

diff --git a/be/src/storage/rowset/beta_rowset_writer.cpp 
b/be/src/storage/rowset/beta_rowset_writer.cpp
index befb9be491c..b9de50944d6 100644
--- a/be/src/storage/rowset/beta_rowset_writer.cpp
+++ b/be/src/storage/rowset/beta_rowset_writer.cpp
@@ -82,6 +82,29 @@ bool is_segment_overlapping(const std::vector<KeyBoundsPB>& 
segments_encoded_key
     return false;
 }
 
+bool truncate_key_bounds(KeyBoundsPB* key_bounds) {
+    DCHECK(key_bounds != nullptr);
+    if (config::random_segments_key_bounds_truncation) {
+        return false;
+    }
+    const int32_t truncation_threshold = 
config::segments_key_bounds_truncation_threshold;
+    if (truncation_threshold <= 0) {
+        return false;
+    }
+    const size_t truncation_size = cast_set<size_t>(truncation_threshold);
+
+    bool truncated = false;
+    if (key_bounds->min_key().size() > truncation_size) {
+        key_bounds->mutable_min_key()->resize(truncation_size);
+        truncated = true;
+    }
+    if (key_bounds->max_key().size() > truncation_size) {
+        key_bounds->mutable_max_key()->resize(truncation_size);
+        truncated = true;
+    }
+    return truncated;
+}
+
 void build_rowset_meta_with_spec_field(RowsetMeta& rowset_meta,
                                        const RowsetMeta& spec_rowset_meta) {
     rowset_meta.set_num_rows(spec_rowset_meta.num_rows());
@@ -991,6 +1014,7 @@ Status 
BaseBetaRowsetWriter::_build_rowset_meta(RowsetMeta* rowset_meta, bool ch
     int64_t total_index_size = 0;
     std::vector<KeyBoundsPB> segments_encoded_key_bounds;
     std::vector<uint32_t> segment_rows;
+    std::optional<bool> segments_key_bounds_truncated;
     {
         std::lock_guard<std::mutex> lock(_segid_statistics_map_mutex);
         for (const auto& itr : _segid_statistics_map) {
@@ -1001,6 +1025,7 @@ Status 
BaseBetaRowsetWriter::_build_rowset_meta(RowsetMeta* rowset_meta, bool ch
             // segcompaction don't modify _segment_num_rows, so we need to get 
segment rows from _segid_statistics_map for load
             segment_rows.push_back(cast_set<uint32_t>(itr.second.row_num));
         }
+        segments_key_bounds_truncated = _segments_key_bounds_truncated;
     }
     if (segment_rows.empty()) {
         // vertical compaction and linked schema change will not record 
segment statistics,
@@ -1011,8 +1036,8 @@ Status 
BaseBetaRowsetWriter::_build_rowset_meta(RowsetMeta* rowset_meta, bool ch
     for (auto& key_bound : _segments_encoded_key_bounds) {
         segments_encoded_key_bounds.push_back(key_bound);
     }
-    if (_segments_key_bounds_truncated.has_value()) {
-        
rowset_meta->set_segments_key_bounds_truncated(_segments_key_bounds_truncated.value());
+    if (segments_key_bounds_truncated.has_value()) {
+        
rowset_meta->set_segments_key_bounds_truncated(segments_key_bounds_truncated.value());
     }
     rowset_meta->set_num_segment_rows(segment_rows);
     // segment key bounds are empty in old version(before version 1.2.x). So 
we should not modify
@@ -1199,14 +1224,19 @@ Status 
BetaRowsetWriter::_check_segment_number_limit(size_t segnum) {
 
 Status BaseBetaRowsetWriter::add_segment(uint32_t segment_id, const 
SegmentStatistics& segstat) {
     uint32_t segid_offset = segment_id - _segment_start_id;
+    SegmentStatistics stored_segstat = segstat;
+    const bool key_bounds_truncated = 
truncate_key_bounds(&stored_segstat.key_bounds);
     {
         std::lock_guard<std::mutex> lock(_segid_statistics_map_mutex);
         CHECK_EQ(_segid_statistics_map.find(segment_id) == 
_segid_statistics_map.end(), true);
-        _segid_statistics_map.emplace(segment_id, segstat);
+        _segid_statistics_map.emplace(segment_id, std::move(stored_segstat));
         if (segment_id >= _segment_num_rows.size()) {
             _segment_num_rows.resize(segment_id + 1);
         }
         _segment_num_rows[segid_offset] = cast_set<uint32_t>(segstat.row_num);
+        if (key_bounds_truncated) {
+            _segments_key_bounds_truncated = true;
+        }
     }
     VLOG_DEBUG << "_segid_statistics_map add new record. segment_id:" << 
segment_id
                << " row_num:" << segstat.row_num << " data_size:" << 
segstat.data_size
@@ -1251,10 +1281,14 @@ Status 
BetaRowsetWriter::flush_segment_writer_for_segcompaction(
     segstat.data_size = segment_size;
     segstat.index_size = inverted_index_file_size;
     segstat.key_bounds = key_bounds;
+    const bool key_bounds_truncated = truncate_key_bounds(&segstat.key_bounds);
     {
         std::lock_guard<std::mutex> lock(_segid_statistics_map_mutex);
         CHECK_EQ(_segid_statistics_map.find(segid) == 
_segid_statistics_map.end(), true);
-        _segid_statistics_map.emplace(segid, segstat);
+        _segid_statistics_map.emplace(segid, std::move(segstat));
+        if (key_bounds_truncated) {
+            _segments_key_bounds_truncated = true;
+        }
     }
     VLOG_DEBUG << "_segid_statistics_map add new record. segid:" << segid << " 
row_num:" << row_num
                << " data_size:" << PrettyPrinter::print_bytes(segment_size)


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to