This is an automated email from the ASF dual-hosted git repository. dlmarion pushed a commit to branch 2.1 in repository https://gitbox.apache.org/repos/asf/accumulo.git
The following commit(s) were added to refs/heads/2.1 by this push: new df053cc63e Manager balancer fixes (#5070) df053cc63e is described below commit df053cc63e6e998c1ca264d5078cbd2337759356 Author: Dave Marion <dlmar...@apache.org> AuthorDate: Tue Dec 3 12:50:26 2024 -0500 Manager balancer fixes (#5070) Modified Manager balancer code such that the tservers for the ROOT and METADATA DataLevels are recalculated on each loop to account for any change in available tablet servers, and ignoring any migrations that the balancer may emit for tablets outside of the current DataLevel. --- .../java/org/apache/accumulo/manager/Manager.java | 38 ++++++++++++++++++---- 1 file changed, 31 insertions(+), 7 deletions(-) diff --git a/server/manager/src/main/java/org/apache/accumulo/manager/Manager.java b/server/manager/src/main/java/org/apache/accumulo/manager/Manager.java index 9aca12d7ab..44800d5833 100644 --- a/server/manager/src/main/java/org/apache/accumulo/manager/Manager.java +++ b/server/manager/src/main/java/org/apache/accumulo/manager/Manager.java @@ -1045,7 +1045,7 @@ public class Manager extends AbstractServer } // Create a view of the tserver status such that it only contains the tables // for this level in the tableMap. 
- final SortedMap<TServerInstance,TabletServerStatus> tserverStatusForLevel = + SortedMap<TServerInstance,TabletServerStatus> tserverStatusForLevel = createTServerStatusView(dl, tserverStatus); // Construct the Thrift variant of the map above for the BalancerParams final SortedMap<TabletServerId,TServerStatus> tserverStatusForBalancerLevel = @@ -1057,17 +1057,36 @@ public class Manager extends AbstractServer int attemptNum = 0; do { log.debug("Balancing for tables at level {}, times-in-loop: {}", dl, ++attemptNum); - params = BalanceParamsImpl.fromThrift(tserverStatusForBalancerLevel, - tserverStatusForLevel, partitionedMigrations.get(dl)); + + SortedMap<TabletServerId,TServerStatus> statusForBalancerLevel = + tserverStatusForBalancerLevel; + if (attemptNum > 1 && (dl == DataLevel.ROOT || dl == DataLevel.METADATA)) { + // If we are still migrating then perform a re-check on the tablet + // servers to make sure none of them have failed. + Set<TServerInstance> currentServers = tserverSet.getCurrentServers(); + tserverStatus = gatherTableInformation(currentServers); + // Create a view of the tserver status such that it only contains the tables + // for this level in the tableMap. 
+ tserverStatusForLevel = createTServerStatusView(dl, tserverStatus); + final SortedMap<TabletServerId,TServerStatus> tserverStatusForBalancerLevel2 = + new TreeMap<>(); + tserverStatusForLevel.forEach((tsi, status) -> tserverStatusForBalancerLevel2 + .put(new TabletServerIdImpl(tsi), TServerStatusImpl.fromThrift(status))); + statusForBalancerLevel = tserverStatusForBalancerLevel2; + } + + params = BalanceParamsImpl.fromThrift(statusForBalancerLevel, tserverStatusForLevel, + partitionedMigrations.get(dl)); wait = Math.max(tabletBalancer.balance(params), wait); - migrationsOutForLevel = params.migrationsOut().size(); - for (TabletMigration m : checkMigrationSanity(tserverStatusForBalancerLevel.keySet(), - params.migrationsOut())) { + migrationsOutForLevel = 0; + for (TabletMigration m : checkMigrationSanity(statusForBalancerLevel.keySet(), + params.migrationsOut(), dl)) { final KeyExtent ke = KeyExtent.fromTabletId(m.getTablet()); if (migrations.containsKey(ke)) { log.warn("balancer requested migration more than once, skipping {}", m); continue; } + migrationsOutForLevel++; migrations.put(ke, TabletServerIdImpl.toThrift(m.getNewTabletServer())); log.debug("migration {}", m); } @@ -1091,11 +1110,16 @@ public class Manager extends AbstractServer } private List<TabletMigration> checkMigrationSanity(Set<TabletServerId> current, - List<TabletMigration> migrations) { + List<TabletMigration> migrations, DataLevel level) { return migrations.stream().filter(m -> { boolean includeMigration = false; if (m.getTablet() == null) { log.error("Balancer gave back a null tablet {}", m); + } else if (DataLevel.of(m.getTablet().getTable()) != level) { + log.trace( + "Balancer wants to move a tablet ({}) outside of the current processing level ({}), " + + "ignoring and should be processed at the correct level ({})", + m.getTablet(), level, DataLevel.of(m.getTablet().getTable())); } else if (m.getNewTabletServer() == null) { log.error("Balancer did not set the destination {}", m); } else 
if (m.getOldTabletServer() == null) {