zhiqiang-hhhh commented on code in PR #33331: URL: https://github.com/apache/doris/pull/33331#discussion_r1556805502
########## be/src/runtime/runtime_query_statistics_mgr.cpp: ########## @@ -17,14 +17,432 @@ #include "runtime/runtime_query_statistics_mgr.h" +#include <gen_cpp/FrontendService_types.h> +#include <gen_cpp/RuntimeProfile_types.h> +#include <gen_cpp/Status_types.h> +#include <gen_cpp/Types_types.h> +#include <thrift/TApplicationException.h> + +#include <condition_variable> +#include <cstdint> +#include <memory> +#include <mutex> +#include <random> +#include <shared_mutex> +#include <string> +#include <tuple> +#include <unordered_map> +#include <vector> + +#include "common/logging.h" #include "runtime/client_cache.h" #include "runtime/exec_env.h" +#include "runtime/query_context.h" +#include "service/backend_options.h" #include "util/debug_util.h" +#include "util/hash_util.hpp" #include "util/time.h" +#include "util/uid_util.h" #include "vec/core/block.h" namespace doris { +static Status _do_report_exec_stats_rpc(const TNetworkAddress& coor_addr, + const TReportExecStatusParams& req, + TReportExecStatusResult& res) { + Status client_status; + FrontendServiceConnection rpc_client(ExecEnv::GetInstance()->frontend_client_cache(), coor_addr, + &client_status); + if (!client_status.ok()) { + LOG_WARNING( + "could not get client rpc client of {} when reporting profiles, reason is {}, " + "not reporting, profile will be lost", + PrintThriftNetworkAddress(coor_addr), client_status.to_string()); + return Status::RpcError("Client rpc client failed"); + } + + try { + try { + rpc_client->reportExecStatus(res, req); + } catch (const apache::thrift::transport::TTransportException& e) { + LOG_WARNING("Transport exception from {}, reason: {}, reopening", + PrintThriftNetworkAddress(coor_addr), e.what()); + client_status = rpc_client.reopen(config::thrift_rpc_timeout_ms); + if (!client_status.ok()) { + LOG_WARNING("Reopen failed, reason: {}", client_status.to_string()); + return Status::RpcError("Open rpc client failed"); + } + + rpc_client->reportExecStatus(res, req); + } + } catch 
(apache::thrift::TApplicationException& e) { + if (e.getType() == e.UNKNOWN_METHOD) { + LOG_WARNING( + "Failed to send statistics to {} due to {}, usually because the frontend " + "is not upgraded, check the version", + PrintThriftNetworkAddress(coor_addr), e.what()); + } else { + LOG_WARNING( + "Failed to send statistics to {}, reason: {}, you can see fe log for " + "details.", + PrintThriftNetworkAddress(coor_addr), e.what()); + } + return Status::RpcError("Send stats failed"); + } catch (std::exception& e) { + LOG_WARNING("Failed to send statistics to {}, reason: {}, you can see fe log for details.", + PrintThriftNetworkAddress(coor_addr), e.what()); + return Status::RpcError("Send stats failed"); + } + + return Status::OK(); +} + +TReportExecStatusParams RuntimeQueryStatiticsMgr::create_report_exec_status_params_x( + const TUniqueId& query_id, + const std::unordered_map<int32, std::vector<std::shared_ptr<TRuntimeProfileTree>>>& + fragment_id_to_profile, + const std::vector<std::shared_ptr<TRuntimeProfileTree>>& load_channel_profiles) { + TQueryProfile profile; + profile.__set_query_id(query_id); + + std::map<int32_t, std::vector<TDetailedReportParams>> fragment_id_to_profile_req; + + for (const auto& entry : fragment_id_to_profile) { + int32_t fragment_id = entry.first; + const std::vector<std::shared_ptr<TRuntimeProfileTree>>& fragment_profile = entry.second; + std::vector<TDetailedReportParams> detailed_params; + + for (auto pipeline_profile : fragment_profile) { + if (pipeline_profile == nullptr) { + auto msg = fmt::format("Register fragment profile {} {} failed, profile is null", + print_id(query_id), fragment_id); + DCHECK(false) << msg; + LOG_ERROR(msg); + continue; + } + + TDetailedReportParams tmp; + tmp.__set_profile(*pipeline_profile); + // tmp.fragment_instance_id is not needed for pipeline x + detailed_params.push_back(tmp); + } + + fragment_id_to_profile_req.insert(std::make_pair(fragment_id, detailed_params)); + } + + if 
(fragment_id_to_profile_req.size() == 0) { + LOG_WARNING("No fragment profile found for query {}", print_id(query_id)); + } + + profile.__set_fragment_id_to_profile(fragment_id_to_profile_req); + + std::vector<TRuntimeProfileTree> load_channel_profiles_req; + for (auto load_channel_profile : load_channel_profiles) { + if (load_channel_profile == nullptr) { + auto msg = fmt::format( + "Register fragment profile {} {} failed, load channel profile is null", + print_id(query_id), -1); + DCHECK(false) << msg; + LOG_ERROR(msg); + continue; + } + + load_channel_profiles_req.push_back(*load_channel_profile); + } + + if (load_channel_profiles_req.size() > 0) { + profile.__set_load_channel_profiles(load_channel_profiles_req); + } + + TReportExecStatusParams req; + req.__set_query_profile(profile); + req.__set_query_id(TUniqueId()); + + return req; +} + +TReportExecStatusParams RuntimeQueryStatiticsMgr::create_report_exec_status_params_non_pipeline( + const TUniqueId& query_id, + const std::unordered_map<TUniqueId, std::shared_ptr<TRuntimeProfileTree>>& + instance_id_to_profile, + const std::vector<std::shared_ptr<TRuntimeProfileTree>>& load_channel_profile) { + TQueryProfile profile; + std::vector<TUniqueId> fragment_instance_ids; + std::vector<TRuntimeProfileTree> instance_profiles; + std::vector<TRuntimeProfileTree> load_channel_profiles; + + for (auto entry : instance_id_to_profile) { + TUniqueId instance_id = entry.first; + std::shared_ptr<TRuntimeProfileTree> profile = entry.second; + + if (profile == nullptr) { + auto msg = fmt::format("Register instance profile {} {} failed, profile is null", + print_id(query_id), print_id(instance_id)); + DCHECK(false) << msg; + LOG_ERROR(msg); + continue; + } + + fragment_instance_ids.push_back(instance_id); + instance_profiles.push_back(*profile); + } + + profile.__set_query_id(query_id); + profile.__set_fragment_instance_ids(fragment_instance_ids); + profile.__set_instance_profiles(instance_profiles); + 
profile.__set_load_channel_profiles(load_channel_profiles); + + TReportExecStatusParams res; + res.__set_query_profile(profile); + res.__set_query_id(TUniqueId()); + return res; +} + +void RuntimeQueryStatiticsMgr::start_report_thread() { + if (started.load()) { + DCHECK(false) << "report thread has been started"; + LOG_ERROR("report thread has been started"); + return; + } + + started.store(true); + + for (size_t i = 0; i < config::report_exec_status_thread_num; ++i) { + this->_report_profile_threads.emplace_back(std::make_unique<std::thread>( + &RuntimeQueryStatiticsMgr::report_query_profiles_thread, this)); + } +} + +void RuntimeQueryStatiticsMgr::report_query_profiles_thread() { + while (true) { + { + std::unique_lock<std::mutex> lock(_report_profile_mutex); + + while (_query_profile_map.empty() && _profile_map_x.empty() && + !_report_profile_thread_stop) { + _report_profile_cv.wait(lock); Review Comment: In what situation would this thread hang forever? -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org For additional commands, e-mail: commits-h...@doris.apache.org