This is an automated email from the ASF dual-hosted git repository.

dlmarion pushed a commit to branch 2.1
in repository https://gitbox.apache.org/repos/asf/accumulo.git


The following commit(s) were added to refs/heads/2.1 by this push:
     new df053cc63e Manager balancer fixes (#5070)
df053cc63e is described below

commit df053cc63e6e998c1ca264d5078cbd2337759356
Author: Dave Marion <dlmar...@apache.org>
AuthorDate: Tue Dec 3 12:50:26 2024 -0500

    Manager balancer fixes (#5070)
    
    Modified the Manager balancer code so that the tservers for the
    ROOT and METADATA DataLevels are recalculated on each loop
    to account for any change in available tablet servers, and so that
    any migrations the balancer emits for tablets outside of
    the current DataLevel are ignored.
---
 .../java/org/apache/accumulo/manager/Manager.java  | 38 ++++++++++++++++++----
 1 file changed, 31 insertions(+), 7 deletions(-)

diff --git a/server/manager/src/main/java/org/apache/accumulo/manager/Manager.java b/server/manager/src/main/java/org/apache/accumulo/manager/Manager.java
index 9aca12d7ab..44800d5833 100644
--- a/server/manager/src/main/java/org/apache/accumulo/manager/Manager.java
+++ b/server/manager/src/main/java/org/apache/accumulo/manager/Manager.java
@@ -1045,7 +1045,7 @@ public class Manager extends AbstractServer
         }
         // Create a view of the tserver status such that it only contains the 
tables
         // for this level in the tableMap.
-        final SortedMap<TServerInstance,TabletServerStatus> 
tserverStatusForLevel =
+        SortedMap<TServerInstance,TabletServerStatus> tserverStatusForLevel =
             createTServerStatusView(dl, tserverStatus);
         // Construct the Thrift variant of the map above for the BalancerParams
         final SortedMap<TabletServerId,TServerStatus> 
tserverStatusForBalancerLevel =
@@ -1057,17 +1057,36 @@ public class Manager extends AbstractServer
         int attemptNum = 0;
         do {
           log.debug("Balancing for tables at level {}, times-in-loop: {}", dl, 
++attemptNum);
-          params = BalanceParamsImpl.fromThrift(tserverStatusForBalancerLevel,
-              tserverStatusForLevel, partitionedMigrations.get(dl));
+
+          SortedMap<TabletServerId,TServerStatus> statusForBalancerLevel =
+              tserverStatusForBalancerLevel;
+          if (attemptNum > 1 && (dl == DataLevel.ROOT || dl == 
DataLevel.METADATA)) {
+            // If we are still migrating then perform a re-check on the tablet
+            // servers to make sure none of them have failed.
+            Set<TServerInstance> currentServers = 
tserverSet.getCurrentServers();
+            tserverStatus = gatherTableInformation(currentServers);
+            // Create a view of the tserver status such that it only contains 
the tables
+            // for this level in the tableMap.
+            tserverStatusForLevel = createTServerStatusView(dl, tserverStatus);
+            final SortedMap<TabletServerId,TServerStatus> 
tserverStatusForBalancerLevel2 =
+                new TreeMap<>();
+            tserverStatusForLevel.forEach((tsi, status) -> 
tserverStatusForBalancerLevel2
+                .put(new TabletServerIdImpl(tsi), 
TServerStatusImpl.fromThrift(status)));
+            statusForBalancerLevel = tserverStatusForBalancerLevel2;
+          }
+
+          params = BalanceParamsImpl.fromThrift(statusForBalancerLevel, 
tserverStatusForLevel,
+              partitionedMigrations.get(dl));
           wait = Math.max(tabletBalancer.balance(params), wait);
-          migrationsOutForLevel = params.migrationsOut().size();
-          for (TabletMigration m : 
checkMigrationSanity(tserverStatusForBalancerLevel.keySet(),
-              params.migrationsOut())) {
+          migrationsOutForLevel = 0;
+          for (TabletMigration m : 
checkMigrationSanity(statusForBalancerLevel.keySet(),
+              params.migrationsOut(), dl)) {
             final KeyExtent ke = KeyExtent.fromTabletId(m.getTablet());
             if (migrations.containsKey(ke)) {
               log.warn("balancer requested migration more than once, skipping 
{}", m);
               continue;
             }
+            migrationsOutForLevel++;
             migrations.put(ke, 
TabletServerIdImpl.toThrift(m.getNewTabletServer()));
             log.debug("migration {}", m);
           }
@@ -1091,11 +1110,16 @@ public class Manager extends AbstractServer
     }
 
     private List<TabletMigration> checkMigrationSanity(Set<TabletServerId> 
current,
-        List<TabletMigration> migrations) {
+        List<TabletMigration> migrations, DataLevel level) {
       return migrations.stream().filter(m -> {
         boolean includeMigration = false;
         if (m.getTablet() == null) {
           log.error("Balancer gave back a null tablet {}", m);
+        } else if (DataLevel.of(m.getTablet().getTable()) != level) {
+          log.trace(
+              "Balancer wants to move a tablet ({}) outside of the current 
processing level ({}), "
+                  + "ignoring and should be processed at the correct level 
({})",
+              m.getTablet(), level, DataLevel.of(m.getTablet().getTable()));
         } else if (m.getNewTabletServer() == null) {
           log.error("Balancer did not set the destination {}", m);
         } else if (m.getOldTabletServer() == null) {

Reply via email to