zhiqiang-hhhh commented on code in PR #23582: URL: https://github.com/apache/doris/pull/23582#discussion_r1308442174
########## be/src/agent/heartbeat_server.cpp: ########## @@ -219,6 +222,69 @@ Status HeartbeatServer::_heartbeat(const TMasterInfo& master_info) { _master_info->__set_backend_id(master_info.backend_id); } + if (master_info.__isset.frontend_infos) { + std::vector<TFrontendInfo> abnormal_fe_infos; + abnormal_fe_infos.reserve(_fe_infos.size()); + + // Step1: check change + std::set<TNetworkAddress> dropped_fes; + + for (const auto& cur_fe : _fe_infos) { + dropped_fes.insert(cur_fe.first); + } + + for (const auto& coming_fe_info : master_info.frontend_infos) { + auto itr = _fe_infos.find(coming_fe_info.network_address); + + if (itr == _fe_infos.end()) { + // A new frontend. + // Do not regard new frontend as a restart fe. Since it has no "expired" queries. + LOG(INFO) << "A completely new frontend, " << PrintFrontendInfo(coming_fe_info); + LOG(INFO) << "fe start uuid: " << coming_fe_info.start_uuid; + _fe_infos[coming_fe_info.network_address] = FeInternalInfo{coming_fe_info, 0}; + continue; + } + + dropped_fes.erase(coming_fe_info.network_address); + + // 0 start_uuid means this fe is in an UNKNOWN state for this be currently. + // Unkonw state means this fe could be: 1. still dead; 2. already startup (after this hb is sent). + // In the second situation, it is possible that the fe has already started to provide query service. + // So we will not cancel its query immediately, until we get a clear message (its next valid start_uuid). + if (coming_fe_info.start_uuid == itr->second._fe_info.start_uuid) { + continue; + } + + // If a fe is in a unknown state in three seq hbs, it will be regarded as an abnormal fe. 
+ if (coming_fe_info.start_uuid == 0 && ++itr->second._unknown_counter < 3) { + continue; + } + + LOG(INFO) << "Got an abnormal frontend, address: " + << PrintThriftNetworkAddress(coming_fe_info.network_address) + << " previoud start uuid: " << itr->second._fe_info.start_uuid + << " new start uuid: " << coming_fe_info.start_uuid + << " , so we are going to canel all running fragments related to fe."; + + abnormal_fe_infos.emplace_back(coming_fe_info); + + _fe_infos[coming_fe_info.network_address] = FeInternalInfo{coming_fe_info, 0}; + } + + for (const auto& dropped_fe : dropped_fes) { + LOG(INFO) << "Frontend " << PrintThriftNetworkAddress(dropped_fe) << " has already been dropped, " + << " so going to cancel related fragments."; + + abnormal_fe_infos.emplace_back((*_fe_infos.find(dropped_fe)).second._fe_info); + _fe_infos.erase(dropped_fe); + } + + // Step2: notify cancel thread if any abnormal fes. + if (!abnormal_fe_infos.empty()) { + _exec_env->fragment_mgr()->add_abnormal_fe_infos_and_notify(abnormal_fe_infos); Review Comment: The function name is wrong: there is no notify (condition-variable) operation here. The function just takes a lightweight lock and assigns the vector to fragment_mgr; fragment_mgr::cancel_worker will then check whether any query should be canceled. NOTE: the actual cancel operation is not performed under this lock; threads only read or write the vector under it, and no I/O happens while the lock is held. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org For additional commands, e-mail: commits-h...@doris.apache.org