gavinchou commented on code in PR #41782: URL: https://github.com/apache/doris/pull/41782#discussion_r1821257543
########## cloud/src/meta-service/meta_service_tablet_stats.cpp: ########## @@ -156,4 +165,239 @@ void internal_get_tablet_stats(MetaServiceCode& code, std::string& msg, Transact merge_tablet_stats(stats, detached_stats); } +MetaServiceResponseStatus parse_fix_tablet_stats_param( + std::shared_ptr<ResourceManager> resource_mgr, const std::string& table_id_str, + const std::string& cloud_unique_id_str, int64_t& table_id, std::string& instance_id) { + MetaServiceCode code = MetaServiceCode::OK; + std::string msg; + MetaServiceResponseStatus st; + st.set_code(MetaServiceCode::OK); + + // parse params + try { + table_id = std::stoll(table_id_str); + } catch (...) { + st.set_code(MetaServiceCode::INVALID_ARGUMENT); + st.set_msg("Invalid table_id, table_id: " + table_id_str); + return st; + } + + instance_id = get_instance_id(resource_mgr, cloud_unique_id_str); + if (instance_id.empty()) { + code = MetaServiceCode::INVALID_ARGUMENT; + msg = "empty instance_id"; + LOG(INFO) << msg << ", cloud_unique_id=" << cloud_unique_id_str; + st.set_code(code); + st.set_msg(msg); + return st; + } + return st; +} + +MetaServiceResponseStatus fix_tablet_stats_internal( + std::shared_ptr<TxnKv> txn_kv, std::pair<std::string, std::string>& key_pair, + std::vector<std::shared_ptr<TabletStatsPB>>& tablet_stat_shared_ptr_vec_batch, + const std::string& instance_id, size_t batch_size) { + std::unique_ptr<Transaction> txn; + MetaServiceResponseStatus st; + st.set_code(MetaServiceCode::OK); + MetaServiceCode code = MetaServiceCode::OK; + std::unique_ptr<RangeGetIterator> it; + std::vector<std::shared_ptr<TabletStatsPB>> tmp_tablet_stat_vec; + + TxnErrorCode err = txn_kv->create_txn(&txn); + if (err != TxnErrorCode::TXN_OK) { + st.set_code(cast_as<ErrCategory::CREATE>(err)); + st.set_msg("failed to create txn"); + return st; + } + + // read tablet stats + err = txn->get(key_pair.first, key_pair.second, &it, true); + if (err != TxnErrorCode::TXN_OK) { + 
st.set_code(cast_as<ErrCategory::READ>(err)); + st.set_msg(fmt::format("failed to get tablet stats, err={} ", err)); + return st; + } + + size_t tablet_cnt = 0; + while (it->has_next() && tablet_cnt < batch_size) { + auto [k, v] = it->next(); + key_pair.first = k; + auto k1 = k; + k1.remove_prefix(1); + std::vector<std::tuple<std::variant<int64_t, std::string>, int, int>> out; + decode_key(&k1, &out); + + // 0x01 "stats" ${instance_id} "tablet" ${table_id} ${index_id} ${partition_id} ${tablet_id} -> TabletStatsPB + if (out.size() == 7) { + tablet_cnt++; + TabletStatsPB tablet_stat; + tablet_stat.ParseFromArray(v.data(), v.size()); + tmp_tablet_stat_vec.emplace_back(std::make_shared<TabletStatsPB>(tablet_stat)); + } + } + if (it->has_next()) { + key_pair.first = it->next().first; + } + + for (const auto& tablet_stat_ptr : tmp_tablet_stat_vec) { + GetRowsetResponse resp; + std::string msg; + // get rowsets in tablet and accumulate disk size + internal_get_rowset(txn.get(), 0, std::numeric_limits<int64_t>::max() - 1, instance_id, + tablet_stat_ptr->idx().tablet_id(), code, msg, &resp); + if (code != MetaServiceCode::OK) { + st.set_code(code); + st.set_msg(msg); + return st; + } + int64_t total_disk_size = 0; + for (const auto& rs_meta : resp.rowset_meta()) { + rs_meta.rowset_id(); + total_disk_size += rs_meta.total_disk_size(); + } + + // set new disk size to tabletPB and write it back + TabletStatsPB tablet_stat; + tablet_stat.CopyFrom(*tablet_stat_ptr); + tablet_stat.set_data_size(total_disk_size); + // record tablet stats batch + tablet_stat_shared_ptr_vec_batch.emplace_back(std::make_shared<TabletStatsPB>(tablet_stat)); + std::string tablet_stat_key; + std::string tablet_stat_value; + tablet_stat_key = stats_tablet_key( + {instance_id, tablet_stat.idx().table_id(), tablet_stat.idx().index_id(), + tablet_stat.idx().partition_id(), tablet_stat.idx().tablet_id()}); + if (!tablet_stat.SerializeToString(&tablet_stat_value)) { + 
st.set_code(MetaServiceCode::PROTOBUF_SERIALIZE_ERR); + st.set_msg("failed to serialize tablet stat"); + return st; + } + txn->put(tablet_stat_key, tablet_stat_value); + + // read num segs + // 0x01 "stats" ${instance_id} "tablet" ${table_id} ${index_id} ${partition_id} ${tablet_id} "num_segs" -> int64 + std::string tablet_stat_num_segs_key; + stats_tablet_num_segs_key( + {instance_id, tablet_stat_ptr->idx().table_id(), tablet_stat_ptr->idx().index_id(), + tablet_stat_ptr->idx().partition_id(), tablet_stat_ptr->idx().tablet_id()}, + &tablet_stat_num_segs_key); + int64_t tablet_stat_num_segs = 0; + std::string tablet_stat_num_segs_value(sizeof(tablet_stat_num_segs), '\0'); + err = txn->get(tablet_stat_num_segs_key, &tablet_stat_num_segs_value); + if (err != TxnErrorCode::TXN_OK && err != TxnErrorCode::TXN_KEY_NOT_FOUND) { + st.set_code(cast_as<ErrCategory::READ>(err)); + } + if (tablet_stat_num_segs_value.size() != sizeof(tablet_stat_num_segs)) [[unlikely]] { + LOG(WARNING) << " malformed tablet stats value v.size=" + << tablet_stat_num_segs_value.size() + << " value=" << hex(tablet_stat_num_segs_value); + } + std::memcpy(&tablet_stat_num_segs, tablet_stat_num_segs_value.data(), + sizeof(tablet_stat_num_segs)); + if constexpr (std::endian::native == std::endian::big) { + tablet_stat_num_segs = bswap_64(tablet_stat_num_segs); + } + + if (tablet_stat_num_segs > 0) { + // set tablet stats data size = 0 + // 0x01 "stats" ${instance_id} "tablet" ${table_id} ${index_id} ${partition_id} ${tablet_id} "data_size" -> int64 + std::string tablet_stat_data_size_key; + stats_tablet_data_size_key( + {instance_id, tablet_stat.idx().table_id(), tablet_stat.idx().index_id(), + tablet_stat.idx().partition_id(), tablet_stat.idx().tablet_id()}, + &tablet_stat_data_size_key); + int64_t tablet_stat_data_size = 0; + std::string tablet_stat_data_size_value(sizeof(tablet_stat_data_size), '\0'); + memcpy(tablet_stat_data_size_value.data(), &tablet_stat_data_size, + 
sizeof(tablet_stat_data_size)); + txn->put(tablet_stat_data_size_key, tablet_stat_data_size_value); + } + } + + err = txn->commit(); + if (err != TxnErrorCode::TXN_OK) { + st.set_code(cast_as<ErrCategory::COMMIT>(err)); + st.set_msg("failed to commit txn"); + return st; + } + return st; +} + +MetaServiceResponseStatus check_new_tablet_stats( + std::shared_ptr<TxnKv> txn_kv, const std::string& instance_id, + const std::vector<std::shared_ptr<TabletStatsPB>>& tablet_stat_shared_ptr_vec_batch) { + std::unique_ptr<Transaction> txn; + MetaServiceResponseStatus st; + st.set_code(MetaServiceCode::OK); + + TxnErrorCode err = txn_kv->create_txn(&txn); + if (err != TxnErrorCode::TXN_OK) { + st.set_code(cast_as<ErrCategory::CREATE>(err)); + st.set_msg("failed to create txn"); + return st; + } + + for (const auto& tablet_stat_ptr : tablet_stat_shared_ptr_vec_batch) { + // check tablet stats + std::string tablet_stat_key; + std::string tablet_stat_value; + tablet_stat_key = stats_tablet_key( + {instance_id, tablet_stat_ptr->idx().table_id(), tablet_stat_ptr->idx().index_id(), + tablet_stat_ptr->idx().partition_id(), tablet_stat_ptr->idx().tablet_id()}); + err = txn->get(tablet_stat_key, &tablet_stat_value); + if (err != TxnErrorCode::TXN_OK && err != TxnErrorCode::TXN_KEY_NOT_FOUND) { + st.set_code(cast_as<ErrCategory::READ>(err)); + } + TabletStatsPB tablet_stat_check; + tablet_stat_check.ParseFromArray(tablet_stat_value.data(), tablet_stat_value.size()); + if (tablet_stat_check.DebugString() != tablet_stat_ptr->DebugString() && + // If anyone data size of tablet_stat_check and tablet_stat_ptr is twice bigger than another, + // we need to rewrite it this tablet_stat. 
+ (tablet_stat_check.data_size() > 2 * tablet_stat_ptr->data_size() || + tablet_stat_ptr->data_size() > 2 * tablet_stat_check.data_size())) { + LOG_WARNING("[fix tablet stats]:tablet stats check failed") + .tag("tablet stat", tablet_stat_ptr->DebugString()) + .tag("check tabelt stat", tablet_stat_check.DebugString()); + } + + // check data size + std::string tablet_stat_data_size_key; + stats_tablet_data_size_key( + {instance_id, tablet_stat_ptr->idx().table_id(), tablet_stat_ptr->idx().index_id(), + tablet_stat_ptr->idx().partition_id(), tablet_stat_ptr->idx().tablet_id()}, + &tablet_stat_data_size_key); + int64_t tablet_stat_data_size = 0; + std::string tablet_stat_data_size_value(sizeof(tablet_stat_data_size), '\0'); + err = txn->get(tablet_stat_data_size_key, &tablet_stat_data_size_value); + if (err != TxnErrorCode::TXN_OK && err != TxnErrorCode::TXN_KEY_NOT_FOUND) { + st.set_code(cast_as<ErrCategory::READ>(err)); Review Comment: Should not continue here: when the read fails, return `st` right after setting the error code instead of falling through to the rest of the loop body. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org For additional commands, e-mail: commits-h...@doris.apache.org