This is an automated email from the ASF dual-hosted git repository.

yiguolei pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/master by this push: new 28e4c6936ca [metric](cloud) add metrics for get tablets and rowsets (#51320) 28e4c6936ca is described below commit 28e4c6936ca1f298c6ca3b1e947209b3285ea32b Author: TengJianPing <tengjianp...@selectdb.com> AuthorDate: Sat May 31 18:05:56 2025 +0800 [metric](cloud) add metrics for get tablets and rowsets (#51320) --- be/src/cloud/config.cpp | 1 + be/src/cloud/config.h | 1 + be/src/pipeline/exec/olap_scan_operator.cpp | 34 +++++++++++++++++++++++++++++ be/src/util/doris_metrics.cpp | 6 +++++ be/src/util/doris_metrics.h | 3 +++ be/test/util/doris_metrics_test.cpp | 11 ++++++++++ 6 files changed, 56 insertions(+) diff --git a/be/src/cloud/config.cpp b/be/src/cloud/config.cpp index bc5c90e6e94..b0f80835598 100644 --- a/be/src/cloud/config.cpp +++ b/be/src/cloud/config.cpp @@ -40,6 +40,7 @@ DEFINE_Int64(tablet_cache_capacity, "100000"); DEFINE_Int64(tablet_cache_shards, "16"); DEFINE_mInt32(tablet_sync_interval_s, "1800"); DEFINE_mInt32(init_scanner_sync_rowsets_parallelism, "10"); +DEFINE_mInt32(sync_rowsets_slow_threshold_ms, "1000"); DEFINE_mInt64(min_compaction_failure_interval_ms, "5000"); DEFINE_mInt64(base_compaction_freeze_interval_s, "7200"); diff --git a/be/src/cloud/config.h b/be/src/cloud/config.h index 9e724082c9f..f7b85231cbd 100644 --- a/be/src/cloud/config.h +++ b/be/src/cloud/config.h @@ -71,6 +71,7 @@ DECLARE_Int64(tablet_cache_shards); DECLARE_mInt32(tablet_sync_interval_s); // parallelism for scanner init where may issue RPCs to sync rowset meta from MS DECLARE_mInt32(init_scanner_sync_rowsets_parallelism); +DECLARE_mInt32(sync_rowsets_slow_threshold_ms); // Cloud compaction config DECLARE_mInt64(min_compaction_failure_interval_ms); diff --git a/be/src/pipeline/exec/olap_scan_operator.cpp b/be/src/pipeline/exec/olap_scan_operator.cpp index 58a7730e8e5..b642c857dd1 100644 --- a/be/src/pipeline/exec/olap_scan_operator.cpp +++ b/be/src/pipeline/exec/olap_scan_operator.cpp @@ -513,6 
+513,40 @@ Status OlapScanLocalState::hold_tablets() { COUNTER_UPDATE(_sync_rowset_get_remote_delete_bitmap_rpc_timer, sync_stats.get_remote_delete_bitmap_rpc_ns); } + auto time_ms = duration_ns / 1000 / 1000; + if (time_ms >= config::sync_rowsets_slow_threshold_ms) { + DorisMetrics::instance()->get_remote_tablet_slow_time_ms->increment(time_ms); + DorisMetrics::instance()->get_remote_tablet_slow_cnt->increment(1); + LOG_WARNING("get tablet takes too long") + .tag("query_id", print_id(PipelineXLocalState<>::_state->query_id())) + .tag("node_id", _parent->node_id()) + .tag("total_time", PrettyPrinter::print(duration_ns, TUnit::TIME_NS)) + .tag("num_tablets", _tablets.size()) + .tag("tablet_meta_cache_hit", _sync_rowset_tablet_meta_cache_hit->value()) + .tag("tablet_meta_cache_miss", _sync_rowset_tablet_meta_cache_miss->value()) + .tag("get_remote_tablet_meta_rpc_time", + PrettyPrinter::print( + _sync_rowset_get_remote_tablet_meta_rpc_timer->value(), + TUnit::TIME_NS)) + .tag("remote_rowsets_num", _sync_rowset_get_remote_rowsets_num->value()) + .tag("get_remote_rowsets_rpc_time", + PrettyPrinter::print(_sync_rowset_get_remote_rowsets_rpc_timer->value(), + TUnit::TIME_NS)) + .tag("local_delete_bitmap_rowsets_num", + _sync_rowset_get_local_delete_bitmap_rowsets_num->value()) + .tag("remote_delete_bitmap_rowsets_num", + _sync_rowset_get_remote_delete_bitmap_rowsets_num->value()) + .tag("remote_delete_bitmap_key_count", + _sync_rowset_get_remote_delete_bitmap_key_count->value()) + .tag("remote_delete_bitmap_bytes", + PrettyPrinter::print(_sync_rowset_get_remote_delete_bitmap_bytes->value(), + TUnit::BYTES)) + .tag("get_remote_delete_bitmap_rpc_time", + PrettyPrinter::print( + _sync_rowset_get_remote_delete_bitmap_rpc_timer->value(), + TUnit::TIME_NS)); + } + } else { for (size_t i = 0; i < _scan_ranges.size(); i++) { int64_t version = 0; diff --git a/be/src/util/doris_metrics.cpp b/be/src/util/doris_metrics.cpp index bed4624ba9c..653ec275ae9 100644 --- 
a/be/src/util/doris_metrics.cpp +++ b/be/src/util/doris_metrics.cpp @@ -214,6 +214,9 @@ DEFINE_COUNTER_METRIC_PROTOTYPE_2ARG(runtime_filter_consumer_wait_ready_ms, MetricUnit::MILLISECONDS); DEFINE_GAUGE_CORE_METRIC_PROTOTYPE_2ARG(runtime_filter_consumer_timeout_num, MetricUnit::NOUNIT); +DEFINE_COUNTER_METRIC_PROTOTYPE_2ARG(get_remote_tablet_slow_time_ms, MetricUnit::MILLISECONDS); +DEFINE_COUNTER_METRIC_PROTOTYPE_2ARG(get_remote_tablet_slow_cnt, MetricUnit::NOUNIT); + const std::string DorisMetrics::_s_registry_name = "doris_be"; const std::string DorisMetrics::_s_hook_name = "doris_metrics"; @@ -353,6 +356,9 @@ DorisMetrics::DorisMetrics() : _metric_registry(_s_registry_name) { INT_GAUGE_METRIC_REGISTER(_server_metric_entity, runtime_filter_consumer_ready_num); INT_COUNTER_METRIC_REGISTER(_server_metric_entity, runtime_filter_consumer_wait_ready_ms); INT_GAUGE_METRIC_REGISTER(_server_metric_entity, runtime_filter_consumer_timeout_num); + + INT_COUNTER_METRIC_REGISTER(_server_metric_entity, get_remote_tablet_slow_time_ms); + INT_COUNTER_METRIC_REGISTER(_server_metric_entity, get_remote_tablet_slow_cnt); } void DorisMetrics::initialize(bool init_system_metrics, const std::set<std::string>& disk_devices, diff --git a/be/src/util/doris_metrics.h b/be/src/util/doris_metrics.h index 5b8515e7e5f..2a1827d06a1 100644 --- a/be/src/util/doris_metrics.h +++ b/be/src/util/doris_metrics.h @@ -241,6 +241,9 @@ public: IntCounter* runtime_filter_consumer_wait_ready_ms = nullptr; IntGauge* runtime_filter_consumer_timeout_num = nullptr; + IntCounter* get_remote_tablet_slow_time_ms = nullptr; + IntCounter* get_remote_tablet_slow_cnt = nullptr; + static DorisMetrics* instance() { static DorisMetrics instance; return &instance; diff --git a/be/test/util/doris_metrics_test.cpp b/be/test/util/doris_metrics_test.cpp index 6e9969b1210..588805e71b1 100644 --- a/be/test/util/doris_metrics_test.cpp +++ b/be/test/util/doris_metrics_test.cpp @@ -178,6 +178,17 @@ TEST_F(DorisMetricsTest, 
Normal) { EXPECT_TRUE(metric != nullptr); EXPECT_STREQ("40", metric->to_string().c_str()); } + { + DorisMetrics::instance()->get_remote_tablet_slow_time_ms->increment(1000); + auto* metric = server_entity->get_metric("get_remote_tablet_slow_time_ms"); + EXPECT_TRUE(metric != nullptr); + EXPECT_STREQ("1000", metric->to_string().c_str()); + + DorisMetrics::instance()->get_remote_tablet_slow_cnt->increment(10); + metric = server_entity->get_metric("get_remote_tablet_slow_cnt"); + EXPECT_TRUE(metric != nullptr); + EXPECT_STREQ("10", metric->to_string().c_str()); + } } } // namespace doris --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org For additional commands, e-mail: commits-h...@doris.apache.org