zhannngchen commented on code in PR #14595:
URL: https://github.com/apache/doris/pull/14595#discussion_r1049280771


##########
be/src/common/config.h:
##########
@@ -838,6 +838,9 @@ CONF_Int32(segcompaction_threshold_segment_num, "10");
 // The segment whose row number above the threshold will be compacted during 
segcompaction
 CONF_Int32(segcompaction_small_threshold, "1048576");
 
+// Enable final-pass segcompaction before commit txn
+CONF_Bool(enable_final_pass_segcompaction, "false");

Review Comment:
   We can make it as the default behavior os segment compaction?



##########
be/src/olap/rowset/beta_rowset_writer.cpp:
##########
@@ -552,6 +575,58 @@ Status 
BetaRowsetWriter::_segcompaction_ramaining_if_necessary() {
     return status;
 }
 
+Status BetaRowsetWriter::_segcompaction_finalpass_wait() {
+    Status status = Status::OK();
+    DCHECK_EQ(_is_doing_segcompaction, false);
+    if (!config::enable_storage_vectorization) {
+        return Status::OK();
+    }
+    if (_segcompaction_status.load() != OLAP_SUCCESS) {
+        return Status::OLAPInternalError(OLAP_ERR_SEGCOMPACTION_FAILED);
+    }
+
+    int64_t num_seg = _is_segcompacted() ? _num_segcompacted : _num_segment;
+    if (num_seg <= FINAL_SEGCOMPACTION_NUM_THRESHOLD) {
+        LOG(INFO) << "num_seg less than max_segment_num_per_rowset, skip final 
pass: " << num_seg
+                  << " vs " << config::max_segment_num_per_rowset;
+        return Status::OK();
+    }
+
+    // reset num counters for the final segcompaction
+    _num_segment = _num_segcompacted.load();
+    _num_segcompacted = 0;
+    _segcompacted_point = 0;
+
+    _total_row_num_written = 0;
+    {
+        std::lock_guard<std::mutex> lock(_segid_statistics_map_mutex);
+        for (const auto& itr : _segid_statistics_map) {
+            _total_row_num_written += itr.second.row_num;
+        }
+    }
+
+    while (_segcompacted_point != _num_segment) {
+        _is_doing_segcompaction = true;

Review Comment:
   no lock protection here?



##########
be/src/olap/rowset/beta_rowset_writer.cpp:
##########
@@ -552,6 +575,58 @@ Status 
BetaRowsetWriter::_segcompaction_ramaining_if_necessary() {
     return status;
 }
 
+Status BetaRowsetWriter::_segcompaction_finalpass_wait() {
+    Status status = Status::OK();
+    DCHECK_EQ(_is_doing_segcompaction, false);
+    if (!config::enable_storage_vectorization) {
+        return Status::OK();
+    }
+    if (_segcompaction_status.load() != OLAP_SUCCESS) {
+        return Status::OLAPInternalError(OLAP_ERR_SEGCOMPACTION_FAILED);
+    }
+
+    int64_t num_seg = _is_segcompacted() ? _num_segcompacted : _num_segment;
+    if (num_seg <= FINAL_SEGCOMPACTION_NUM_THRESHOLD) {
+        LOG(INFO) << "num_seg less than max_segment_num_per_rowset, skip final 
pass: " << num_seg

Review Comment:
   How about the case that the data size is quite big, and 
`total_size/MAX_SEGMENT_SIZE > FINAL_SEGCOMPACTION_NUM_THRESHOLD`?



##########
be/src/olap/rowset/beta_rowset_writer.cpp:
##########
@@ -400,19 +407,22 @@ Status BetaRowsetWriter::_load_noncompacted_segments(
             return Status::OLAPInternalError(OLAP_ERR_ROWSET_LOAD_FAILED);
         }
         segments->push_back(std::move(segment));
+        if (++segment_num >= FINAL_SEGCOMPACTION_MAX_SEGMENTS) {
+            break;
+        }
     }
     return Status::OK();
 }
 
 /* policy of segcompaction target selection:
- *  1. skip big segments
+ *  1. skip (and rename to keep segment id sequential when necessary) big 
segments
  *  2. if the consecutive smalls end up with a big, compact the smalls, except
  *     single small
  *  3. if the consecutive smalls end up with small, compact the smalls if the
  *     length is beyond (config::segcompaction_threshold_segment_num / 2)
  */
 Status BetaRowsetWriter::_find_longest_consecutive_small_segment(
-        SegCompactionCandidatesSharedPtr segments) {
+        SegCompactionCandidatesSharedPtr segments, size_t small_threshold, 
bool is_greedy) {

Review Comment:
   Add a comment to explain new parameter `is _greedy`



##########
be/src/olap/rowset/beta_rowset_writer.h:
##########
@@ -127,6 +144,12 @@ class BetaRowsetWriter : public RowsetWriter {
     bool _is_segment_overlapping(const std::vector<KeyBoundsPB>& 
segments_encoded_key_bounds);
 
 protected:
+    // segment num below will not trigger final segcompaction
+    const uint32_t FINAL_SEGCOMPACTION_NUM_THRESHOLD = 50;

Review Comment:
   I think we can merge this 2 const variables to 1



##########
be/src/olap/rowset/beta_rowset_writer.h:
##########
@@ -32,6 +32,18 @@ class FileWriter;
 using SegCompactionCandidates = std::vector<segment_v2::SegmentSharedPtr>;
 using SegCompactionCandidatesSharedPtr = 
std::shared_ptr<SegCompactionCandidates>;
 
+/* A load process can be divided into: 1) Loading 2) Publishing (Committing 
txn)

Review Comment:
   Publishing is not equal to committing, they are different phase



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org
For additional commands, e-mail: commits-h...@doris.apache.org

Reply via email to