This is an automated email from the ASF dual-hosted git repository. yiguolei pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/master by this push: new 5368bb19b07 [feature](be jvm monitor)append enable_jvm_monitor in be.conf to control jvm monitor. (#35608) 5368bb19b07 is described below commit 5368bb19b072791bbc6f33c0f814a281ce0f8f83 Author: daidai <2017501...@qq.com> AuthorDate: Sat Jun 1 10:14:51 2024 +0800 [feature](be jvm monitor)append enable_jvm_monitor in be.conf to control jvm monitor. (#35608) ## Proposed changes before pr : #35023 In order to prevent doris_be from crashing when collecting jvm information due to jvm incompatibility issues, you can set `enable_jvm_monitor = true / false` in `be.conf` to enable the jvm metrics. The default value of `enable_jvm_monitor` is false. When JVM monitoring has 30 consecutive exceptions, turn off JVM information collection and set all values to 0. Issue Number: close #xxx <!--Describe your changes.--> --------- Co-authored-by: morningman <morning...@163.com> --- be/src/common/config.cpp | 4 + be/src/common/config.h | 4 + be/src/util/jvm_metrics.cpp | 112 ++++++++++++++++++++----- be/src/util/jvm_metrics.h | 16 ++-- regression-test/pipeline/external/conf/be.conf | 3 + regression-test/pipeline/p0/conf/be.conf | 3 + regression-test/pipeline/p1/conf/be.conf | 3 + 7 files changed, 117 insertions(+), 28 deletions(-) diff --git a/be/src/common/config.cpp b/be/src/common/config.cpp index 3e8d72dba55..ee21430a970 100644 --- a/be/src/common/config.cpp +++ b/be/src/common/config.cpp @@ -1283,8 +1283,12 @@ DEFINE_Int64(max_nonblock_close_thread_num, "64"); DEFINE_mDouble(mem_alloc_fault_probability, "0.0"); // The time out milliseconds for remote fetch schema RPC, default 60s DEFINE_mInt64(fetch_remote_schema_rpc_timeout_ms, "60000"); + DEFINE_Int64(s3_file_system_local_upload_buffer_size, "5242880"); +//JVM monitoring enable. To prevent be from crashing due to jvm compatibility issues. The default setting is off. +DEFINE_Bool(enable_jvm_monitor, "false"); + // clang-format off #ifdef BE_TEST // test s3 diff --git a/be/src/common/config.h b/be/src/common/config.h index 20d85077e3f..8df54c25318 100644 --- a/be/src/common/config.h +++ b/be/src/common/config.h @@ -1361,8 +1361,12 @@ DECLARE_mDouble(mem_alloc_fault_probability); // The time out milliseconds for remote fetch schema RPC DECLARE_mInt64(fetch_remote_schema_rpc_timeout_ms); // The size of the local buffer for S3FileSytem's upload function + DECLARE_Int64(s3_file_system_local_upload_buffer_size); +//JVM monitoring enable. To prevent be from crashing due to jvm compatibility issues. +DECLARE_Bool(enable_jvm_monitor); + #ifdef BE_TEST // test s3 DECLARE_String(test_s3_resource); diff --git a/be/src/util/jvm_metrics.cpp b/be/src/util/jvm_metrics.cpp index e55cf8f3fbe..fc30d1073ac 100644 --- a/be/src/util/jvm_metrics.cpp +++ b/be/src/util/jvm_metrics.cpp @@ -17,10 +17,12 @@ #include "jvm_metrics.h" +#include <util/jni-util.h> + #include <functional> +#include "common/config.h" #include "util/metrics.h" - namespace doris { #define DEFINE_JVM_SIZE_BYTES_METRIC(name, type) \ @@ -76,15 +78,28 @@ DEFINE_COUNTER_METRIC_PROTOTYPE_5ARG(jvm_gc_g1_old_generation_time_ms, MetricUni const char* JvmMetrics::_s_hook_name = "jvm_metrics"; -JvmMetrics::JvmMetrics(MetricRegistry* registry, JNIEnv* env) : _jvm_stats(env) { +JvmMetrics::JvmMetrics(MetricRegistry* registry, JNIEnv* env) { DCHECK(registry != nullptr); _registry = registry; _server_entity = _registry->register_entity("server"); DCHECK(_server_entity != nullptr); - if (_jvm_stats.init_complete()) { + + do { + if (!doris::config::enable_jvm_monitor) { + break; + } + try { + _jvm_stats.init(env); + } catch (...) { + LOG(WARNING) << "JVM STATS INIT FAIL"; + break; + } + if (!_jvm_stats.init_complete()) { + break; + } _server_entity->register_hook(_s_hook_name, std::bind(&JvmMetrics::update, this)); - } + } while (false); INT_GAUGE_METRIC_REGISTER(_server_entity, jvm_heap_size_bytes_max); INT_GAUGE_METRIC_REGISTER(_server_entity, jvm_heap_size_bytes_committed); @@ -117,11 +132,58 @@ JvmMetrics::JvmMetrics(MetricRegistry* registry, JNIEnv* env) : _jvm_stats(env) } void JvmMetrics::update() { - _jvm_stats.refresh(this); + static long fail_count = 0; + bool have_exception = false; + try { + _jvm_stats.refresh(this); + } catch (...) { + have_exception = true; + LOG(WARNING) << "JVM MONITOR UPDATE FAIL!"; + fail_count++; + } + + //When 30 consecutive exceptions occur, turn off jvm information collection. + if (!have_exception) { + fail_count = 0; + } + if (fail_count >= 30) { + LOG(WARNING) << "JVM MONITOR CLOSE!"; + _jvm_stats.set_complete(false); + _server_entity->deregister_hook(_s_hook_name); + + jvm_heap_size_bytes_max->set_value(0); + jvm_heap_size_bytes_committed->set_value(0); + jvm_heap_size_bytes_used->set_value(0); + + jvm_non_heap_size_bytes_used->set_value(0); + jvm_non_heap_size_bytes_committed->set_value(0); + + jvm_young_size_bytes_used->set_value(0); + jvm_young_size_bytes_peak_used->set_value(0); + jvm_young_size_bytes_max->set_value(0); + + jvm_old_size_bytes_used->set_value(0); + jvm_old_size_bytes_peak_used->set_value(0); + jvm_old_size_bytes_max->set_value(0); + + jvm_thread_count->set_value(0); + jvm_thread_peak_count->set_value(0); + jvm_thread_new_count->set_value(0); + jvm_thread_runnable_count->set_value(0); + jvm_thread_blocked_count->set_value(0); + jvm_thread_waiting_count->set_value(0); + jvm_thread_timed_waiting_count->set_value(0); + jvm_thread_terminated_count->set_value(0); + + jvm_gc_g1_young_generation_count->set_value(0); + jvm_gc_g1_young_generation_time_ms->set_value(0); + jvm_gc_g1_old_generation_count->set_value(0); + jvm_gc_g1_old_generation_time_ms->set_value(0); + } } -#include <util/jni-util.h> -jvmStats::jvmStats(JNIEnv* ENV) : env(ENV) { +void JvmStats::init(JNIEnv* ENV) { + env = ENV; _managementFactoryClass = env->FindClass("java/lang/management/ManagementFactory"); if (_managementFactoryClass == nullptr) { LOG(WARNING) @@ -244,15 +306,19 @@ jvmStats::jvmStats(JNIEnv* ENV) : env(ENV) { LOG(INFO) << "Start JVM monitoring."; _init_complete = true; + return; } -#include "jni.h" - -void jvmStats::refresh(JvmMetrics* jvm_metrics) { +void JvmStats::refresh(JvmMetrics* jvm_metrics) { if (!_init_complete) { return; } - static_cast<void>(JniUtil::GetJNIEnv(&env)); + + Status st = JniUtil::GetJNIEnv(&env); + if (!st.ok()) { + LOG(WARNING) << "JVM STATS GET JNI ENV FAIL"; + return; + } jobject memoryMXBeanObj = env->CallStaticObjectMethod(_managementFactoryClass, _getMemoryMXBeanMethod); @@ -302,8 +368,8 @@ void jvmStats::refresh(JvmMetrics* jvm_metrics) { jstring name = (jstring)env->CallObjectMethod(memoryPoolMXBean, _getMemoryPollMXBeanNameMethod); - const char* nameStr = env->GetStringUTFChars(name, NULL); - if (nameStr != NULL) { + const char* nameStr = env->GetStringUTFChars(name, nullptr); + if (nameStr != nullptr) { auto it = _memoryPoolName.find(nameStr); if (it == _memoryPoolName.end()) { continue; @@ -408,16 +474,22 @@ void jvmStats::refresh(JvmMetrics* jvm_metrics) { env->DeleteLocalRef(threadMXBean); env->DeleteLocalRef(gcMXBeansList); } -jvmStats::~jvmStats() { +JvmStats::~JvmStats() { if (!_init_complete) { return; } - env->DeleteLocalRef(_newThreadStateObj); - env->DeleteLocalRef(_runnableThreadStateObj); - env->DeleteLocalRef(_blockedThreadStateObj); - env->DeleteLocalRef(_waitingThreadStateObj); - env->DeleteLocalRef(_timedWaitingThreadStateObj); - env->DeleteLocalRef(_terminatedThreadStateObj); + try { + env->DeleteLocalRef(_newThreadStateObj); + env->DeleteLocalRef(_runnableThreadStateObj); + env->DeleteLocalRef(_blockedThreadStateObj); + env->DeleteLocalRef(_waitingThreadStateObj); + env->DeleteLocalRef(_timedWaitingThreadStateObj); + env->DeleteLocalRef(_terminatedThreadStateObj); + + } catch (...) { + // When be is killed, DeleteLocalRef may fail. + // In order to exit more gracefully, we catch the exception here. + } } } // namespace doris diff --git a/be/src/util/jvm_metrics.h b/be/src/util/jvm_metrics.h index 5f9929d8cf0..459a3cbf938 100644 --- a/be/src/util/jvm_metrics.h +++ b/be/src/util/jvm_metrics.h @@ -17,8 +17,6 @@ #pragma once -#include <jni.h> - #include "jni.h" #include "util/jni-util.h" #include "util/metrics.h" @@ -27,7 +25,7 @@ namespace doris { class JvmMetrics; -class jvmStats { +class JvmStats { private: JNIEnv* env = nullptr; jclass _managementFactoryClass = nullptr; @@ -98,16 +96,18 @@ private: bool _init_complete = false; public: - jvmStats(JNIEnv* ENV); - bool init_complete() { return _init_complete; } + // JvmStats(JNIEnv* ENV); + void init(JNIEnv* ENV); + bool init_complete() const { return _init_complete; } + void set_complete(bool val) { _init_complete = val; } void refresh(JvmMetrics* jvm_metrics); - ~jvmStats(); + ~JvmStats(); }; class JvmMetrics { public: JvmMetrics(MetricRegistry* registry, JNIEnv* env); - ~JvmMetrics() {} + ~JvmMetrics() = default; void update(); IntGauge* jvm_heap_size_bytes_max = nullptr; @@ -140,7 +140,7 @@ public: IntGauge* jvm_gc_g1_old_generation_time_ms = nullptr; private: - jvmStats _jvm_stats; + JvmStats _jvm_stats; std::shared_ptr<MetricEntity> _server_entity; static const char* _s_hook_name; MetricRegistry* _registry = nullptr; diff --git a/regression-test/pipeline/external/conf/be.conf b/regression-test/pipeline/external/conf/be.conf index 85fedf8873f..6fb930cc5ea 100644 --- a/regression-test/pipeline/external/conf/be.conf +++ b/regression-test/pipeline/external/conf/be.conf @@ -72,3 +72,6 @@ enable_set_in_bitmap_value=true enable_feature_binlog=true trino_connector_plugin_dir=/tmp/trino_connector/connectors + +enable_jvm_monitor = true + diff --git a/regression-test/pipeline/p0/conf/be.conf b/regression-test/pipeline/p0/conf/be.conf index 474c30a05de..67605a5bdd9 100644 --- a/regression-test/pipeline/p0/conf/be.conf +++ b/regression-test/pipeline/p0/conf/be.conf @@ -60,3 +60,6 @@ enable_debug_points=true enable_debug_log_timeout_secs=0 trino_connector_plugin_dir=/tmp/trino_connector/connectors + +enable_jvm_monitor = true + diff --git a/regression-test/pipeline/p1/conf/be.conf b/regression-test/pipeline/p1/conf/be.conf index fde67fbbaf7..d278b30fb67 100644 --- a/regression-test/pipeline/p1/conf/be.conf +++ b/regression-test/pipeline/p1/conf/be.conf @@ -58,3 +58,6 @@ user_files_secure_path=/ enable_debug_points=true # debug scanner context dead loop enable_debug_log_timeout_secs=0 + +enable_jvm_monitor = true + --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org For additional commands, e-mail: commits-h...@doris.apache.org