This is an automated email from the ASF dual-hosted git repository.

dlmarion pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/accumulo.git


The following commit(s) were added to refs/heads/main by this push:
     new 4ab1c2eebe Monitor and Message improvements, include Tablet Recovery 
information (#6356)
4ab1c2eebe is described below

commit 4ab1c2eebe4f754a4160ccf4ca996b4b4472db24
Author: Dave Marion <[email protected]>
AuthorDate: Thu May 14 15:28:32 2026 -0400

    Monitor and Message improvements, include Tablet Recovery information (#6356)
    
    Prior to this change the InformationFetcher would create a new
    instance of a SystemInformation object while the Monitor UI
    would display the last SystemInformation object. If any of the
    tasks to create the new SystemInformation object failed, then
    the Monitor would continue to display the old information.
    
    With this change the new SystemInformation object is always
    used, and if it's incomplete then a new message is shown on
    the Messages page.
    
    This change also includes a new computeMessages method that
    is called from SystemInformation.finish. This method includes
    new messages that use new priorities and categories.
    
    Finally, this change introduces a new Tablet Recoveries page
    in the Activity menu. Information for this page is gathered
    from metrics emitted by the server processes and information
    in the tablet metadata.
    
    Closes #6306, #6106
---
 .../schema/filters/NoCurrentLocationFilter.java    |  39 ++
 .../org/apache/accumulo/core/metrics/Metric.java   |  40 +-
 .../apache/accumulo/monitor/next/Endpoints.java    |   9 +
 .../accumulo/monitor/next/InformationFetcher.java  | 543 ++++++++++++++++++---
 .../accumulo/monitor/next/SystemInformation.java   | 393 ++++++++++++++-
 .../org/apache/accumulo/monitor/view/WebViews.java |  18 +
 .../accumulo/monitor/resources/js/functions.js     |  16 +-
 .../accumulo/monitor/resources/js/manager.js       | 111 -----
 .../accumulo/monitor/resources/js/recovery.js      | 252 ++++++++++
 .../accumulo/monitor/resources/js/tservers.js      |  40 --
 .../apache/accumulo/monitor/templates/manager.ftl  |  18 -
 .../apache/accumulo/monitor/templates/navbar.ftl   |   1 +
 .../apache/accumulo/monitor/templates/recovery.ftl |  96 ++++
 .../apache/accumulo/monitor/templates/tservers.ftl |   1 -
 .../org/apache/accumulo/tserver/TabletServer.java  |   9 +-
 .../org/apache/accumulo/tserver/log/LogSorter.java |  18 +-
 .../metrics/TabletServerRecoveryMetrics.java       |  76 +++
 .../org/apache/accumulo/tserver/tablet/Tablet.java |   6 +
 18 files changed, 1416 insertions(+), 270 deletions(-)

diff --git 
a/core/src/main/java/org/apache/accumulo/core/metadata/schema/filters/NoCurrentLocationFilter.java
 
b/core/src/main/java/org/apache/accumulo/core/metadata/schema/filters/NoCurrentLocationFilter.java
new file mode 100644
index 0000000000..0b9315d648
--- /dev/null
+++ 
b/core/src/main/java/org/apache/accumulo/core/metadata/schema/filters/NoCurrentLocationFilter.java
@@ -0,0 +1,39 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   https://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.accumulo.core.metadata.schema.filters;
+
+import java.util.Set;
+import java.util.function.Predicate;
+
+import org.apache.accumulo.core.metadata.schema.TabletMetadata;
+import org.apache.accumulo.core.metadata.schema.TabletMetadata.ColumnType;
+
+public class NoCurrentLocationFilter extends HasCurrentFilter {
+
+  @Override
+  public Set<ColumnType> getColumns() {
+    return super.getColumns();
+  }
+
+  @Override
+  protected Predicate<TabletMetadata> acceptTablet() {
+    return super.acceptTablet().negate();
+  }
+
+}
diff --git a/core/src/main/java/org/apache/accumulo/core/metrics/Metric.java 
b/core/src/main/java/org/apache/accumulo/core/metrics/Metric.java
index 46cfadd347..9b8d4ccc4b 100644
--- a/core/src/main/java/org/apache/accumulo/core/metrics/Metric.java
+++ b/core/src/main/java/org/apache/accumulo/core/metrics/Metric.java
@@ -392,15 +392,30 @@ public enum Metric {
       MetricDocSection.MANAGER, "Manager Goal State", null, NUMBER),
 
   // Recovery Metrics
-  RECOVERIES_IN_PROGRESS("accumulo.recoveries.in.progress", MetricType.GAUGE,
-      "The number of recoveries in progress.", MetricDocSection.GENERAL_SERVER,
-      "Tablet Recoveries In Progress", null, NUMBER),
-  RECOVERIES_LONGEST_RUNTIME("accumulo.recoveries.runtime.longest", 
MetricType.GAUGE,
-      "The time (in milliseconds) of the longest running recovery.",
+  RECOVERIES_SORTS_IN_PROGRESS("accumulo.recoveries.sorts.in.progress", 
MetricType.GAUGE,
+      "The number of log sorts in progress.", MetricDocSection.GENERAL_SERVER,
+      "Log Sorts In Progress", null, NUMBER),
+  
RECOVERIES_SORTS_LONGEST_RUNTIME("accumulo.recoveries.sorts.runtime.longest", 
MetricType.GAUGE,
+      "The time (in milliseconds) of the longest running log sort.",
       MetricDocSection.GENERAL_SERVER, "Tablet Recovery Longest Time", null, 
DURATION),
-  RECOVERIES_AVG_PROGRESS("accumulo.recoveries.avg.progress", MetricType.GAUGE,
-      "The average percentage (0.0 - 99.9) of the in progress recoveries.",
+  RECOVERIES_SORTS_AVG_PROGRESS("accumulo.recoveries.sorts.avg.progress", 
MetricType.GAUGE,
+      "The average percentage (0.0 - 99.9) of the in progress log sorts.",
       MetricDocSection.GENERAL_SERVER, "Tablet Recovery Avg Percent Complete", 
null, PERCENT),
+  RECOVERIES_TABLETS_STARTED("accumulo.recoveries.tablets.started", 
MetricType.GAUGE,
+      "The number of tablet recoveries started", 
MetricDocSection.GENERAL_SERVER,
+      "Tablet Recoveries Started", null, NUMBER),
+  RECOVERIES_TABLETS_COMPLETED("accumulo.recoveries.tablets.completed", 
MetricType.GAUGE,
+      "The number of tablet recoveries completed", 
MetricDocSection.GENERAL_SERVER,
+      "Tablet Recoveries Completed", null, NUMBER),
+  RECOVERIES_TABLETS_FAILED("accumulo.recoveries.tablets.failed", 
MetricType.GAUGE,
+      "The number of tablet recoveries failed", 
MetricDocSection.GENERAL_SERVER,
+      "Tablet Recoveries Failed", null, NUMBER),
+  RECOVERIES_TABLETS_IN_PROGRESS("accumulo.recoveries.tablets.in.progress", 
MetricType.GAUGE,
+      "The number of tablet recoveries in progress", 
MetricDocSection.GENERAL_SERVER,
+      "Tablet Recoveries In Progress", null, NUMBER),
+  
RECOVERIES_TABLETS_MUTATIONS_REPLAYED("accumulo.recoveries.tablets.mutations.replayed",
+      MetricType.GAUGE, "The number of mutations replayed for tablet recovery",
+      MetricDocSection.GENERAL_SERVER, "Tablet Recoveries Mutations Replayed", 
null, NUMBER),
 
   // Executor metrics
   EXECUTOR_COMPLETED("executor.completed", MetricType.FUNCTION_COUNTER,
@@ -555,8 +570,10 @@ public enum Metric {
   public static Set<String> getMonitorExclusions(ServerId.Type serverType) {
     switch (serverType) {
       case COMPACTOR:
-        return Set.of(MINC_PAUSED.getName(), RECOVERIES_AVG_PROGRESS.getName(),
-            RECOVERIES_IN_PROGRESS.getName(), 
RECOVERIES_LONGEST_RUNTIME.getName());
+        return Set.of(MINC_PAUSED.getName(), 
RECOVERIES_TABLETS_STARTED.getName(),
+            RECOVERIES_TABLETS_COMPLETED.getName(), 
RECOVERIES_TABLETS_FAILED.getName(),
+            RECOVERIES_TABLETS_IN_PROGRESS.getName(),
+            RECOVERIES_TABLETS_MUTATIONS_REPLAYED.getName());
       case GARBAGE_COLLECTOR:
         return Set.of();
       case MANAGER:
@@ -564,8 +581,9 @@ public enum Metric {
       case MONITOR:
         return Set.of();
       case SCAN_SERVER:
-        return Set.of(RECOVERIES_AVG_PROGRESS.getName(), 
RECOVERIES_IN_PROGRESS.getName(),
-            RECOVERIES_LONGEST_RUNTIME.getName());
+        return Set.of(RECOVERIES_TABLETS_STARTED.getName(), 
RECOVERIES_TABLETS_COMPLETED.getName(),
+            RECOVERIES_TABLETS_FAILED.getName(), 
RECOVERIES_TABLETS_IN_PROGRESS.getName(),
+            RECOVERIES_TABLETS_MUTATIONS_REPLAYED.getName());
       case TABLET_SERVER:
         return Set.of(MAJC_PAUSED.getName());
       default:
diff --git 
a/server/monitor/src/main/java/org/apache/accumulo/monitor/next/Endpoints.java 
b/server/monitor/src/main/java/org/apache/accumulo/monitor/next/Endpoints.java
index 360829b328..72dfc4a136 100644
--- 
a/server/monitor/src/main/java/org/apache/accumulo/monitor/next/Endpoints.java
+++ 
b/server/monitor/src/main/java/org/apache/accumulo/monitor/next/Endpoints.java
@@ -59,6 +59,7 @@ import 
org.apache.accumulo.monitor.next.SystemInformation.CompactionGroupSummary
 import 
org.apache.accumulo.monitor.next.SystemInformation.CompactionTableSummary;
 import org.apache.accumulo.monitor.next.SystemInformation.MessageCategory;
 import org.apache.accumulo.monitor.next.SystemInformation.MessagePriority;
+import org.apache.accumulo.monitor.next.SystemInformation.RecoveryInformation;
 import org.apache.accumulo.monitor.next.SystemInformation.TableSummary;
 import 
org.apache.accumulo.monitor.next.SystemInformation.TimeOrderedRunningCompactionSet;
 import org.apache.accumulo.monitor.next.deployment.DeploymentOverview;
@@ -419,6 +420,14 @@ public class Endpoints {
     return ti;
   }
 
+  @GET
+  @Path("recovery")
+  @Produces(MediaType.APPLICATION_JSON)
+  @Description("Returns information about tservers performing recovery and 
tablets needing recovery")
+  public RecoveryInformation getTabletRecoveries() {
+    return 
monitor.getInformationFetcher().getSummaryForEndpoint().getRecoveryInformation();
+  }
+
   @GET
   @Path("deployment")
   @Produces(MediaType.APPLICATION_JSON)
diff --git 
a/server/monitor/src/main/java/org/apache/accumulo/monitor/next/InformationFetcher.java
 
b/server/monitor/src/main/java/org/apache/accumulo/monitor/next/InformationFetcher.java
index 666bfe3296..5e0aec350b 100644
--- 
a/server/monitor/src/main/java/org/apache/accumulo/monitor/next/InformationFetcher.java
+++ 
b/server/monitor/src/main/java/org/apache/accumulo/monitor/next/InformationFetcher.java
@@ -18,18 +18,27 @@
  */
 package org.apache.accumulo.monitor.next;
 
+import static java.nio.charset.StandardCharsets.UTF_8;
 import static java.util.concurrent.TimeUnit.SECONDS;
+import static 
org.apache.accumulo.monitor.next.SystemInformation.MessageCategory.Monitor;
+import static 
org.apache.accumulo.monitor.next.SystemInformation.MessageCategory.Table;
+import static 
org.apache.accumulo.monitor.next.SystemInformation.MessagePriority.Critical;
+import static 
org.apache.accumulo.monitor.next.SystemInformation.MessagePriority.Info;
 
 import java.time.Duration;
 import java.util.ArrayList;
+import java.util.Comparator;
 import java.util.Iterator;
 import java.util.List;
+import java.util.Objects;
 import java.util.Set;
 import java.util.concurrent.CancellationException;
+import java.util.concurrent.ConcurrentSkipListSet;
 import java.util.concurrent.ExecutionException;
 import java.util.concurrent.Future;
 import java.util.concurrent.ThreadPoolExecutor;
 import java.util.concurrent.atomic.AtomicBoolean;
+import java.util.concurrent.atomic.AtomicLong;
 import java.util.concurrent.atomic.AtomicReference;
 import java.util.function.Supplier;
 import java.util.stream.Stream;
@@ -44,6 +53,17 @@ import 
org.apache.accumulo.core.client.admin.servers.ServerId.Type;
 import org.apache.accumulo.core.conf.Property;
 import org.apache.accumulo.core.data.RowRange;
 import org.apache.accumulo.core.data.TableId;
+import org.apache.accumulo.core.lock.ServiceLockPaths.AddressSelector;
+import org.apache.accumulo.core.lock.ServiceLockPaths.ResourceGroupPredicate;
+import org.apache.accumulo.core.lock.ServiceLockPaths.ServiceLockPath;
+import org.apache.accumulo.core.metadata.RootTable;
+import org.apache.accumulo.core.metadata.SystemTables;
+import org.apache.accumulo.core.metadata.schema.RootTabletMetadata;
+import org.apache.accumulo.core.metadata.schema.TabletMetadata;
+import org.apache.accumulo.core.metadata.schema.TabletMetadata.ColumnType;
+import org.apache.accumulo.core.metadata.schema.TabletMetadata.Location;
+import 
org.apache.accumulo.core.metadata.schema.filters.NoCurrentLocationFilter;
+import org.apache.accumulo.core.metadata.schema.filters.TabletMetadataFilter;
 import org.apache.accumulo.core.process.thrift.MetricResponse;
 import org.apache.accumulo.core.process.thrift.ServerProcessService.Client;
 import org.apache.accumulo.core.rpc.ThriftUtil;
@@ -108,16 +128,126 @@ public class InformationFetcher implements 
RemovalListener<ServerId,MetricRespon
     }
   }
 
-  private class MetricFetcher implements Runnable {
+  record UpdateTaskFuture(Future<?> future, UpdateTask<?> task) {
+  }
+
+  static class UpdateTasks {
+
+    private final Comparator<UpdateTaskFuture> c = new Comparator<>() {
+
+      @Override
+      public int compare(UpdateTaskFuture o1, UpdateTaskFuture o2) {
+        if (o1.future() == o2.future()) {
+          return 0;
+        } else {
+          if (Objects.equals(o1.task(), o2.task())) {
+            return 0;
+          } else {
+            return Integer.compare(o1.task().hashCode(), o2.task().hashCode());
+          }
+        }
+      }
+
+    };
+    private final ConcurrentSkipListSet<UpdateTaskFuture> futures = new 
ConcurrentSkipListSet<>(c);
+    private final AtomicBoolean stopTables = new AtomicBoolean(false);
+
+    boolean isEmpty() {
+      return futures.isEmpty();
+    }
+
+    Iterator<UpdateTaskFuture> iterator() {
+      return futures.iterator();
+    }
+
+    int size() {
+      return futures.size();
+    }
+
+    void add(UpdateTaskFuture f) {
+      if (stopTables.get() && f.task().getType() == UpdateType.TABLE) {
+        return;
+      }
+      futures.add(f);
+    }
+  }
+
+  enum UpdateType {
+    COMPACTION, COMPACTION_RGS, METRIC, TABLE;
+  }
+
+  interface UpdateTask<T extends Object> extends Runnable, 
Comparable<UpdateTask<T>> {
+
+    UpdateType getType();
+
+    T getResource();
+
+    String getFailureMessage();
+
+  }
+
+  class MetricFetcher implements UpdateTask<ServerId> {
 
     private final ServerContext ctx;
     private final ServerId server;
     private final SystemInformation summary;
+    private final UpdateTasks tasks;
 
-    private MetricFetcher(ServerContext ctx, ServerId server, 
SystemInformation summary) {
+    private MetricFetcher(ServerContext ctx, ServerId server, 
SystemInformation summary,
+        UpdateTasks tasks) {
       this.ctx = ctx;
       this.server = server;
       this.summary = summary;
+      this.tasks = tasks;
+    }
+
+    @Override
+    public int hashCode() {
+      final int prime = 31;
+      int result = 1;
+      result = prime * result + Objects.hash(getType());
+      result = prime * result + Objects.hash(getResource());
+      return result;
+    }
+
+    @Override
+    public boolean equals(Object obj) {
+      if (this == obj) {
+        return true;
+      }
+      if (obj == null) {
+        return false;
+      }
+      if (getClass() != obj.getClass()) {
+        return false;
+      }
+      MetricFetcher other = (MetricFetcher) obj;
+      return Objects.equals(getType(), other.getType())
+          && Objects.equals(getResource(), other.getResource());
+    }
+
+    @Override
+    public int compareTo(UpdateTask<ServerId> other) {
+      int result = this.getType().compareTo(other.getType());
+      if (result == 0) {
+        result = getResource().compareTo(other.getResource());
+      }
+      return result;
+    }
+
+    @Override
+    public UpdateType getType() {
+      return UpdateType.METRIC;
+    }
+
+    @Override
+    public ServerId getResource() {
+      return server;
+    }
+
+    @Override
+    public String getFailureMessage() {
+      return "Failed to get metrics from server: " + server;
     }
 
     @Override
@@ -128,7 +258,7 @@ public class InformationFetcher implements 
RemovalListener<ServerId,MetricRespon
         try {
           MetricResponse response = 
metricsClient.getMetrics(TraceUtil.traceInfo(), ctx.rpcCreds());
           retainedProblemServers.invalidate(server);
-          summary.processResponse(server, response);
+          summary.processResponse(server, response, tasks);
         } finally {
           ThriftUtil.returnClient(metricsClient, ctx);
         }
@@ -138,10 +268,9 @@ public class InformationFetcher implements 
RemovalListener<ServerId,MetricRespon
         summary.processMetricsError(server);
       }
     }
-
   }
 
-  private class TableInformationFetcher implements Runnable {
+  class TableInformationFetcher implements UpdateTask<TableId> {
     private final ServerContext ctx;
     private final TableId tableId;
     private final SystemInformation summary;
@@ -152,6 +281,55 @@ public class InformationFetcher implements 
RemovalListener<ServerId,MetricRespon
       this.summary = summary;
     }
 
+    @Override
+    public int hashCode() {
+      final int prime = 31;
+      int result = 1;
+      result = prime * result + Objects.hash(getType());
+      result = prime * result + Objects.hash(getResource());
+      return result;
+    }
+
+    @Override
+    public boolean equals(Object obj) {
+      if (this == obj) {
+        return true;
+      }
+      if (obj == null) {
+        return false;
+      }
+      if (getClass() != obj.getClass()) {
+        return false;
+      }
+      TableInformationFetcher other = (TableInformationFetcher) obj;
+      return Objects.equals(getType(), other.getType())
+          && Objects.equals(getResource(), other.getResource());
+    }
+
+    @Override
+    public int compareTo(UpdateTask<TableId> other) {
+      int result = this.getType().compareTo(other.getType());
+      if (result == 0) {
+        result = getResource().compareTo(other.getResource());
+      }
+      return result;
+    }
+
+    @Override
+    public UpdateType getType() {
+      return UpdateType.TABLE;
+    }
+
+    @Override
+    public TableId getResource() {
+      return tableId;
+    }
+
+    @Override
+    public String getFailureMessage() {
+      return "Failed to get information for table: " + tableId;
+    }
+
     @Override
     public void run() {
       try {
@@ -169,7 +347,7 @@ public class InformationFetcher implements 
RemovalListener<ServerId,MetricRespon
     }
   }
 
-  private class RunningCompactionFetcher implements Runnable {
+  class RunningCompactionFetcher implements UpdateTask<Void> {
 
     private final SystemInformation summary;
     private final ThreadPoolExecutor executor;
@@ -179,6 +357,49 @@ public class InformationFetcher implements 
RemovalListener<ServerId,MetricRespon
       this.executor = executor;
     }
 
+    @Override
+    public int hashCode() {
+      final int prime = 31;
+      int result = 1;
+      result = prime * result + Objects.hash(getType());
+      return result;
+    }
+
+    @Override
+    public boolean equals(Object obj) {
+      if (this == obj) {
+        return true;
+      }
+      if (obj == null) {
+        return false;
+      }
+      if (getClass() != obj.getClass()) {
+        return false;
+      }
+      RunningCompactionFetcher other = (RunningCompactionFetcher) obj;
+      return Objects.equals(getType(), other.getType());
+    }
+
+    @Override
+    public int compareTo(UpdateTask<Void> other) {
+      return this.getType().compareTo(other.getType());
+    }
+
+    @Override
+    public UpdateType getType() {
+      return UpdateType.COMPACTION;
+    }
+
+    @Override
+    public Void getResource() {
+      return null;
+    }
+
+    @Override
+    public String getFailureMessage() {
+      return "Failed to get running compactions";
+    }
+
     @Override
     public void run() {
       try {
@@ -191,6 +412,70 @@ public class InformationFetcher implements 
RemovalListener<ServerId,MetricRespon
     }
   }
 
+  class ConfiguredCompactionResourceGroupFetcher implements UpdateTask<Void> {
+
+    private final SystemInformation summary;
+
+    public ConfiguredCompactionResourceGroupFetcher(SystemInformation summary) 
{
+      this.summary = summary;
+    }
+
+    @Override
+    public void run() {
+      try {
+        summary.addConfiguredCompactionGroups(
+            CompactionPluginUtils.getConfiguredCompactionResourceGroups(ctx));
+      } catch (ReflectiveOperationException e) {
+        throw new IllegalStateException(e);
+      }
+    }
+
+    @Override
+    public int hashCode() {
+      final int prime = 31;
+      int result = 1;
+      result = prime * result + Objects.hash(getType());
+      return result;
+    }
+
+    @Override
+    public boolean equals(Object obj) {
+      if (this == obj) {
+        return true;
+      }
+      if (obj == null) {
+        return false;
+      }
+      if (getClass() != obj.getClass()) {
+        return false;
+      }
+      ConfiguredCompactionResourceGroupFetcher other =
+          (ConfiguredCompactionResourceGroupFetcher) obj;
+      return Objects.equals(getType(), other.getType());
+    }
+
+    @Override
+    public int compareTo(UpdateTask<Void> other) {
+      return this.getType().compareTo(other.getType());
+    }
+
+    @Override
+    public UpdateType getType() {
+      return UpdateType.COMPACTION_RGS;
+    }
+
+    @Override
+    public Void getResource() {
+      return null;
+    }
+
+    @Override
+    public String getFailureMessage() {
+      return "Error fetching configured compaction resource groups";
+    }
+
+  }
+
   private final String poolName = "MonitorMetricsThreadPool";
   private final ThreadPoolExecutor pool = ThreadPools.getServerThreadPools()
       .getPoolBuilder(poolName).numCoreThreads(10).withTimeOut(30, 
SECONDS).build();
@@ -201,6 +486,7 @@ public class InformationFetcher implements 
RemovalListener<ServerId,MetricRespon
   private final Cache<ServerId,MetricResponse> allMetrics;
   private final Cache<ServerId,Boolean> retainedProblemServers;
   private final AtomicReference<SystemInformation> summaryRef = new 
AtomicReference<>();
+  private final TabletMetadataFilter noLocation = new 
NoCurrentLocationFilter();
 
   public InformationFetcher(ServerContext ctx, Supplier<Long> connectionCount) 
{
     this.ctx = ctx;
@@ -256,11 +542,100 @@ public class InformationFetcher implements 
RemovalListener<ServerId,MetricRespon
     }
   }
 
+  /**
+   * Obtains a count of the metadata tablets with no location. This work is 
done in a Thread because
+   * the Scanner used by Ample will sit and wait for the tablets to be hosted.
+   *
+   * @return count of metadata tablets with no location
+   */
+  private long countMetadataTabletsNoLocation() {
+    // If any Metadata tablet is not hosted, then don't look for table 
information
+    // on other tables.
+    AtomicLong metadataNoLocation = new AtomicLong(0);
+    // This is a background task because the tserver could go down and
+    // the scanner inside Ample will sit there and wait.
+    Runnable countTask = () -> {
+      
metadataNoLocation.set(ctx.getAmple().readTablets().forTable(SystemTables.METADATA.tableId())
+          
.fetch(ColumnType.LOCATION).filter(noLocation).build().stream().count());
+    };
+    Thread countThread = new Thread(countTask, 
"Metadata-Tablets-Location-Thread");
+    countThread.start();
+    try {
+      countThread.join(30_000);
+    } catch (InterruptedException e) {
+      throw new RuntimeException(
+          "Interrupted while waiting for thread counting metadata tablet 
locations");
+    }
+    if (countThread.isAlive()) {
+      countThread.interrupt();
+    }
+    return metadataNoLocation.get();
+  }
+
+  /**
+   * Validates that tablet location is a tablet server that is alive and has 
its lock
+   *
+   * @param location TabletMetadata Location
+   * @return true if location is valid
+   */
+  private boolean isLocationValid(Location location) {
+    if (location != null) {
+      // Verify location is alive
+      Set<ServiceLockPath> servers = ctx.getServerPaths().getTabletServer(
+          ResourceGroupPredicate.ANY, 
AddressSelector.exact(location.getHostAndPort()), true);
+      if (servers != null && !servers.isEmpty()) {
+        return true;
+      }
+    }
+    return false;
+  }
+
+  private void fetchTabletInformation(SystemInformation summary, UpdateTasks 
futures,
+      Location rootTabletLocation) {
+
+    // Fetch information about the root tablet, this is pulled from ZK so
+    // it doesn't depend on the tablet being hosted.
+    TableInformationFetcher rtif =
+        new TableInformationFetcher(this.ctx, SystemTables.ROOT.tableId(), 
summary);
+    Future<?> rtiff = this.pool.submit(rtif);
+    futures.add(new UpdateTaskFuture(rtiff, rtif));
+
+    if (isLocationValid(rootTabletLocation)) {
+
+      // If the root tablet is hosted, then we can ask about the metadata table
+      TableInformationFetcher tif =
+          new TableInformationFetcher(this.ctx, 
SystemTables.METADATA.tableId(), summary);
+      Future<?> tiff = this.pool.submit(tif);
+      futures.add(new UpdateTaskFuture(tiff, tif));
+
+      final long metadataNoLocation = countMetadataTabletsNoLocation();
+      if (metadataNoLocation == 0) {
+
+        // If the metadata table is fully hosted, then we can ask about
+        // all of the other tables
+        for (TableId tableId : 
this.ctx.createQualifiedTableNameToIdMap().values()) {
+          if (tableId.equals(SystemTables.ROOT.tableId())
+              || tableId.equals(SystemTables.METADATA.tableId())) {
+            continue; // we already spawned a task
+          }
+          tif = new TableInformationFetcher(this.ctx, tableId, summary);
+          tiff = this.pool.submit(tif);
+          futures.add(new UpdateTaskFuture(tiff, tif));
+        }
+
+      } else {
+        summary.addMessage(Critical, Table,
+            metadataNoLocation + " metadata tablets are not hosted");
+      }
+    } else {
+      summary.addMessage(Critical, Table, "The root tablet is not currently 
hosted");
+    }
+  }
+
   @Override
   public void run() {
 
     long lastRunTime = 0;
-
     while (true) {
 
       // Don't fetch new data if there are no connections.
@@ -282,98 +657,150 @@ public class InformationFetcher implements 
RemovalListener<ServerId,MetricRespon
 
       LOG.info("Fetching information from servers");
 
-      final List<Future<?>> futures = new ArrayList<>();
+      final UpdateTasks futures = new UpdateTasks();
       final SystemInformation summary = new SystemInformation(allMetrics, 
this.ctx);
       Set<ServerId> compactors = 
this.ctx.instanceOperations().getServers(Type.COMPACTOR);
       summary.processExternalCompactionInventory(compactors);
 
+      // Fetch metrics from the other server processes. This
+      // makes an RPC call to AbstractServer.getMetrics
       for (ServerId.Type type : ServerId.Type.values()) {
         if (type == Type.MONITOR) {
           continue;
         }
         for (ServerId server : this.ctx.instanceOperations().getServers(type)) 
{
-          futures.add(this.pool.submit(new MetricFetcher(this.ctx, server, 
summary)));
+          MetricFetcher mf = new MetricFetcher(this.ctx, server, summary, 
futures);
+          Future<?> mff = this.pool.submit(mf);
+          futures.add(new UpdateTaskFuture(mff, mf));
         }
       }
       ThreadPools.resizePool(pool, () -> Math.max(20, (futures.size() / 20)), 
poolName);
 
       // Fetch external compaction information from the Compactors
-      futures.add(this.pool.submit(new RunningCompactionFetcher(summary, 
pool)));
+      RunningCompactionFetcher rcf = new RunningCompactionFetcher(summary, 
pool);
+      Future<?> rcff = this.pool.submit(rcf);
+      futures.add(new UpdateTaskFuture(rcff, rcf));
 
-      // Fetch Tablet / Tablet information from the metadata table
-      for (TableId tableId : 
this.ctx.createQualifiedTableNameToIdMap().values()) {
-        futures.add(this.pool.submit(new TableInformationFetcher(this.ctx, 
tableId, summary)));
-      }
+      final TabletMetadata rootTabletMetadata =
+          new RootTabletMetadata(new 
String(ctx.getZooCache().get(RootTable.ZROOT_TABLET), UTF_8))
+              .toTabletMetadata();
+      final Location rootTabletLocation = rootTabletMetadata.getLocation();
+      fetchTabletInformation(summary, futures, rootTabletLocation);
 
-      futures.add(this.pool.submit(() -> {
-        try {
-          var groups = 
CompactionPluginUtils.getConfiguredCompactionResourceGroups(ctx);
-          summary.addConfiguredCompactionGroups(groups);
-        } catch (ReflectiveOperationException e) {
-          throw new IllegalStateException(e);
-        }
-      }));
+      ConfiguredCompactionResourceGroupFetcher r =
+          new ConfiguredCompactionResourceGroupFetcher(summary);
+      Future<?> f = this.pool.submit(r);
+      futures.add(new UpdateTaskFuture(f, r));
 
       final long monitorFetchTimeout =
           
ctx.getConfiguration().getTimeInMillis(Property.MONITOR_FETCH_TIMEOUT);
       final long allFuturesAdded = NanoTime.now();
       boolean tookToLong = false;
+
+      final List<UpdateTaskFuture> failures = new ArrayList<>();
+      final List<UpdateTaskFuture> cancelled = new ArrayList<>();
+      boolean firstIteration = true;
       while (!futures.isEmpty()) {
 
         if (NanoTime.millisElapsed(allFuturesAdded, NanoTime.now()) > 
monitorFetchTimeout) {
-          LOG.warn(
-              "Fetching information for Monitor has taken longer {}. 
Cancelling all"
-                  + " remaining tasks and monitor will display old 
information. Resolve issue"
-                  + " causing this or increase property {}.",
-              monitorFetchTimeout, Property.MONITOR_FETCH_TIMEOUT.getKey());
+          String message =
+              "Fetching information for Monitor has taken longer than %1$d ms. 
Cancelling all remaining tasks (%2$d) "
+                  + "and monitor will display old information. Resolve issue 
causing this or increase property %3$s.";
+          LOG.warn(String.format(message, monitorFetchTimeout, futures.size(),
+              Property.MONITOR_FETCH_TIMEOUT.getKey()));
           tookToLong = true;
         }
 
-        Iterator<Future<?>> iter = futures.iterator();
+        boolean isRootLocationValid = isLocationValid(rootTabletLocation);
+        long unhostedMetadataTabletCount = 1;
+        if (isRootLocationValid) {
+          unhostedMetadataTabletCount = countMetadataTabletsNoLocation();
+        }
+        Iterator<UpdateTaskFuture> iter = futures.iterator();
         while (iter.hasNext()) {
-          Future<?> future = iter.next();
-          if (tookToLong && !future.isCancelled()) {
-            future.cancel(true);
-          } else if (future.isDone()) {
+
+          UpdateTaskFuture future = iter.next();
+
+          if (future.future().isDone()) {
             iter.remove();
             try {
-              future.get();
-            } catch (CancellationException | InterruptedException | 
ExecutionException e) {
+              future.future().get();
+            } catch (CancellationException e) {
+              if (!tookToLong) {
+                cancelled.add(future);
+              }
+            } catch (InterruptedException | ExecutionException e) {
+              failures.add(future);
               LOG.error("Error getting status from future", e);
             }
+          } else if (future.task().getClass().equals(TableInformationFetcher.class)
+              && (!isRootLocationValid || unhostedMetadataTabletCount > 0)) {
+            TableInformationFetcher task = (TableInformationFetcher) future.task();
+            TableId tid = task.getResource();
+            if (!isRootLocationValid && !tid.equals(SystemTables.ROOT.tableId())) {
+              LOG.warn(
+                  "Cancelling TableInformationFetcher tasks for non-root tables as root tablet is unhosted. {}",
+                  future.task().getFailureMessage());
+              future.future().cancel(true);
+              cancelled.add(future);
+            } else if (unhostedMetadataTabletCount > 0 && (!tid.equals(SystemTables.ROOT.tableId())
+                && !tid.equals(SystemTables.METADATA.tableId()))) {
+              LOG.warn(
+                  "Cancelling TableInformationFetcher tasks for user tables as metadata tablet has {} unhosted tablets. {}",
+                  unhostedMetadataTabletCount, future.task().getFailureMessage());
+              future.future().cancel(true);
+              cancelled.add(future);
+            }
+          } else if (tookToLong && !future.future().isCancelled()) {
+            LOG.warn("Cancelling task as it took too long. {}", future.task().getFailureMessage());
+            future.future().cancel(true);
+            cancelled.add(future);
           }
         }
+        if (!firstIteration) {
+          // Update current messages on the Monitor that we are
+          // waiting on tasks to complete to complete a refresh
+          final String waitingMsg = "Waiting on " + futures.size()
+              + " tasks to complete. Time remaining before cancellation: "
+              + (monitorFetchTimeout - NanoTime.millisElapsed(allFuturesAdded, NanoTime.now()))
+                  / 1000
+              + " seconds";
+          SystemInformation currentSummary = summaryRef.get();
+          if (currentSummary != null) {
+            currentSummary.removeMessage(Info, Monitor,
+                " tasks to complete. Time remaining before cancellation: ");
+            currentSummary.addMessage(Info, Monitor, waitingMsg);
+          }
+        }
+
         if (!futures.isEmpty()) {
           UtilWaitThread.sleep(3_000);
         }
+        firstIteration = false;
       }
 
       lastRunTime = NanoTime.now();
 
-      if (tookToLong) {
-        summary.clear();
-      } else {
-        retainedProblemServers.asMap().keySet().forEach(summary::retainProblemServer);
-        summary.finish();
-
-        LOG.info("Finished fetching metrics from servers");
-        LOG.info(
-            "All: {}, Managers: {}, Garbage Collector: {}, Compactors: {}, Scan Servers: {}, Tablet Servers: {}",
-            allMetrics.estimatedSize(), summary.getManagers().size(),
-            summary.getGarbageCollector() != null,
-            summary.getCompactorAllMetricSummary().isEmpty() ? 0
-                : summary.getCompactorAllMetricSummary().entrySet().iterator().next().getValue()
-                    .count(),
-            summary.getSServerAllMetricSummary().isEmpty() ? 0
-                : summary.getSServerAllMetricSummary().entrySet().iterator().next().getValue()
-                    .count(),
-            summary.getTServerAllMetricSummary().isEmpty() ? 0 : summary
-                .getTServerAllMetricSummary().entrySet().iterator().next().getValue().count());
-
-        SystemInformation oldSummary = summaryRef.getAndSet(summary);
-        if (oldSummary != null) {
-          oldSummary.clear();
-        }
+      retainedProblemServers.asMap().keySet().forEach(summary::retainProblemServer);
+      summary.finish(failures, cancelled);
+
+      LOG.info("Finished fetching metrics from servers");
+      LOG.info(
+          "All: {}, Managers: {}, Garbage Collector: {}, Compactors: {}, Scan Servers: {}, Tablet Servers: {}",
+          allMetrics.estimatedSize(), summary.getManagers().size(),
+          summary.getGarbageCollector() != null,
+          summary.getCompactorAllMetricSummary().isEmpty() ? 0
+              : summary.getCompactorAllMetricSummary().entrySet().iterator().next().getValue()
+                  .count(),
+          summary.getSServerAllMetricSummary().isEmpty() ? 0
+              : summary.getSServerAllMetricSummary().entrySet().iterator().next().getValue()
+                  .count(),
+          summary.getTServerAllMetricSummary().isEmpty() ? 0 : summary.getTServerAllMetricSummary()
+              .entrySet().iterator().next().getValue().count());
+
+      SystemInformation oldSummary = summaryRef.getAndSet(summary);
+      if (oldSummary != null) {
+        oldSummary.clear();
       }
     }
 
diff --git a/server/monitor/src/main/java/org/apache/accumulo/monitor/next/SystemInformation.java b/server/monitor/src/main/java/org/apache/accumulo/monitor/next/SystemInformation.java
index b45bad0f3c..e5d7491f58 100644
--- a/server/monitor/src/main/java/org/apache/accumulo/monitor/next/SystemInformation.java
+++ b/server/monitor/src/main/java/org/apache/accumulo/monitor/next/SystemInformation.java
@@ -21,6 +21,8 @@ package org.apache.accumulo.monitor.next;
 import static com.google.common.base.Suppliers.memoize;
 import static org.apache.accumulo.core.metrics.MetricsInfo.QUEUE_TAG_KEY;
 import static org.apache.accumulo.monitor.next.SystemInformation.MessageCategory.Configuration;
+import static org.apache.accumulo.monitor.next.SystemInformation.MessageCategory.Monitor;
+import static org.apache.accumulo.monitor.next.SystemInformation.MessageCategory.Resource;
 import static org.apache.accumulo.monitor.next.SystemInformation.MessageCategory.Table;
 import static org.apache.accumulo.monitor.next.SystemInformation.MessagePriority.Critical;
 import static org.apache.accumulo.monitor.next.SystemInformation.MessagePriority.High;
@@ -44,6 +46,7 @@ import java.util.Set;
 import java.util.TreeSet;
 import java.util.concurrent.ConcurrentHashMap;
 import java.util.concurrent.ConcurrentSkipListSet;
+import java.util.concurrent.atomic.AtomicBoolean;
 import java.util.concurrent.atomic.AtomicInteger;
 import java.util.concurrent.atomic.AtomicLong;
 import java.util.concurrent.atomic.AtomicReference;
@@ -64,14 +67,22 @@ import org.apache.accumulo.core.data.TableId;
 import org.apache.accumulo.core.data.TabletId;
 import org.apache.accumulo.core.dataImpl.KeyExtent;
 import org.apache.accumulo.core.dataImpl.TabletIdImpl;
+import org.apache.accumulo.core.lock.ServiceLockPaths.AddressSelector;
+import org.apache.accumulo.core.lock.ServiceLockPaths.ResourceGroupPredicate;
+import org.apache.accumulo.core.lock.ServiceLockPaths.ServiceLockPath;
 import org.apache.accumulo.core.metadata.SystemTables;
 import org.apache.accumulo.core.metadata.TabletState;
+import org.apache.accumulo.core.metadata.schema.TabletMetadata.LocationType;
 import org.apache.accumulo.core.metrics.Metric;
 import org.apache.accumulo.core.metrics.flatbuffers.FMetric;
 import org.apache.accumulo.core.metrics.flatbuffers.FTag;
 import org.apache.accumulo.core.process.thrift.MetricResponse;
 import org.apache.accumulo.core.spi.balancer.TableLoadBalancer;
 import org.apache.accumulo.core.util.compaction.RunningCompactionInfo;
+import org.apache.accumulo.monitor.next.InformationFetcher.MetricFetcher;
+import org.apache.accumulo.monitor.next.InformationFetcher.TableInformationFetcher;
+import org.apache.accumulo.monitor.next.InformationFetcher.UpdateTaskFuture;
+import org.apache.accumulo.monitor.next.InformationFetcher.UpdateTasks;
 import org.apache.accumulo.monitor.next.deployment.DeploymentOverview;
 import org.apache.accumulo.monitor.next.views.ColumnFactory;
 import org.apache.accumulo.monitor.next.views.Status;
@@ -85,6 +96,7 @@ import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
 import com.github.benmanes.caffeine.cache.Cache;
+import com.google.common.net.HostAndPort;
 
 import io.micrometer.core.instrument.Clock;
 import io.micrometer.core.instrument.Meter.Id;
@@ -376,7 +388,73 @@ public class SystemInformation {
   }
 
   public enum MessageCategory {
-    Configuration, Table;
+    Configuration, Monitor, Resource, Table;
+  }
+
+  public class RecoveryOverview {
+
+    private final AtomicBoolean rootTabletRecovering = new AtomicBoolean(false);
+    private final AtomicLong metadataTabletsRecovering = new AtomicLong(0);
+    private final AtomicLong userTabletsRecovering = new AtomicLong(0);
+
+    public void setRootTabletRecovering(boolean recover) {
+      rootTabletRecovering.compareAndExchange(false, recover);
+    }
+
+    public boolean getRootTabletRecovering() {
+      return rootTabletRecovering.get();
+    }
+
+    public long getMetadataTabletsRecovering() {
+      return metadataTabletsRecovering.get();
+    }
+
+    public void setMetadataTabletsRecovering(long recover) {
+      metadataTabletsRecovering.compareAndSet(0, recover);
+    }
+
+    public long getUserTabletsRecovering() {
+      return userTabletsRecovering.get();
+    }
+
+    public void setUserTabletsRecovering(long recover) {
+      userTabletsRecovering.compareAndSet(0, recover);
+    }
+  }
+
+  public record LogSorts(String server, String resourceGroup, String type, Number inProgress,
+      Number avgProgress, Number longestDuration) {
+  }
+
+  public record TabletRecoveries(String server, String resourceGroup, Number started,
+      Number completed, Number failed, Number inProgress, Number mutationsReplayed) {
+  }
+
+  public record TabletNeedingRecovery(String tableId, String tabletId, String tabletDir,
+      String location) {
+  }
+
+  public class RecoveryInformation {
+    private final RecoveryOverview overview = new RecoveryOverview();
+    private final List<TabletNeedingRecovery> tabletsNeedingRecovery = new ArrayList<>();
+    private final List<LogSorts> serversPerformingLogSorting = new ArrayList<>();
+    private final List<TabletRecoveries> serversRecoveringTablets = new ArrayList<>();
+
+    public RecoveryOverview getOverview() {
+      return overview;
+    }
+
+    public List<LogSorts> getServersSortingLogs() {
+      return serversPerformingLogSorting;
+    }
+
+    public List<TabletRecoveries> getServersRecoveringTablets() {
+      return serversRecoveringTablets;
+    }
+
+    public List<TabletNeedingRecovery> getTabletsNeedingRecovery() {
+      return tabletsNeedingRecovery;
+    }
   }
 
   private static final Logger LOG = LoggerFactory.getLogger(SystemInformation.class);
@@ -435,6 +513,7 @@ public class SystemInformation {
   // Table Information
   private final Map<TableId,TableSummary> tables = new ConcurrentHashMap<>();
   private final Map<TableId,List<TabletInformation>> tablets = new ConcurrentHashMap<>();
+  private final RecoveryInformation recoveries = new RecoveryInformation();
 
   // Deployment Overview
   private final Map<ResourceGroupId,Map<ServerId.Type,ProcessSummary>> deployment =
@@ -493,11 +572,16 @@ public class SystemInformation {
     serverMetricsView.clear();
   }
 
-  private void addMessage(MessagePriority pri, MessageCategory cat, String msg) {
+  public void addMessage(MessagePriority pri, MessageCategory cat, String msg) {
     messages.computeIfAbsent(pri, k -> new EnumMap<>(MessageCategory.class))
         .computeIfAbsent(cat, k -> new TreeSet<>()).add(msg);
   }
 
+  public void removeMessage(MessagePriority pri, MessageCategory cat, String part) {
+    messages.getOrDefault(pri, new EnumMap<>(MessageCategory.class))
+        .getOrDefault(cat, new HashSet<String>()).removeIf(s -> s.contains(part));
+  }
+
   private void updateAggregates(final MetricResponse response,
       final Map<Id,CumulativeDistributionSummary> total,
       final Map<String,Map<Id,CumulativeDistributionSummary>> rg) {
@@ -535,6 +619,66 @@ public class SystemInformation {
 
   }
 
+  private void captureRecoveriesInProgress(final ServerId server, final MetricResponse response) {
+    if (TableDataFactory.hasMetricData(response)) {
+      Number logSortsInProgress = 0;
+      Number logSortsAvgProgress = 0;
+      Number logSortsLongestRuntime = 0;
+      Number tabletRecoveriesStarted = 0;
+      Number tabletRecoveriesCompleted = 0;
+      Number tabletRecoveriesFailed = 0;
+      Number tabletRecoveriesInProgress = 0;
+      Number tabletRecoveriesMutationsReplayed = 0;
+      for (ByteBuffer bb : response.getMetrics()) {
+        final FMetric fm = FMetric.getRootAsFMetric(bb);
+        final String name = fm.name();
+        final Metric m = Metric.fromName(name);
+        switch (m) {
+          case RECOVERIES_SORTS_IN_PROGRESS:
+            logSortsInProgress = getMetricValue(fm);
+            break;
+          case RECOVERIES_SORTS_AVG_PROGRESS:
+            logSortsAvgProgress = getMetricValue(fm);
+            break;
+          case RECOVERIES_SORTS_LONGEST_RUNTIME:
+            logSortsLongestRuntime = getMetricValue(fm);
+            break;
+          case RECOVERIES_TABLETS_STARTED:
+            tabletRecoveriesStarted = getMetricValue(fm);
+            break;
+          case RECOVERIES_TABLETS_COMPLETED:
+            tabletRecoveriesCompleted = getMetricValue(fm);
+            break;
+          case RECOVERIES_TABLETS_FAILED:
+            tabletRecoveriesFailed = getMetricValue(fm);
+            break;
+          case RECOVERIES_TABLETS_IN_PROGRESS:
+            tabletRecoveriesInProgress = getMetricValue(fm);
+            break;
+          case RECOVERIES_TABLETS_MUTATIONS_REPLAYED:
+            tabletRecoveriesMutationsReplayed = getMetricValue(fm);
+            break;
+          default:
+            break;
+        }
+      }
+      if (logSortsInProgress.longValue() > 0) {
+        this.recoveries.getServersSortingLogs()
+            .add(new LogSorts(server.toHostPortString(), server.getResourceGroup().canonical(),
+                server.getType().name(), logSortsInProgress, logSortsAvgProgress,
+                logSortsLongestRuntime));
+      }
+      if (tabletRecoveriesInProgress.longValue() > 0) {
+        this.recoveries.getServersRecoveringTablets()
+            .add(new TabletRecoveries(server.toHostPortString(),
+                server.getResourceGroup().canonical(), tabletRecoveriesStarted.longValue(),
+                tabletRecoveriesCompleted.longValue(), tabletRecoveriesFailed.longValue(),
+                tabletRecoveriesInProgress.longValue(),
+                tabletRecoveriesMutationsReplayed.longValue()));
+      }
+    }
+  }
+
   private TableData createCompactionQueueSummary(final Set<ServerId> managers) {
 
     final Column COMPACTION_QUEUE_COL =
@@ -580,7 +724,7 @@ public class SystemInformation {
 
     for (ServerId manager : managers) {
       MetricResponse response = allMetrics.getIfPresent(manager);
-      if (response.getMetrics() != null) {
+      if (response != null && response.getMetrics() != null) {
 
         FMetric fm = new FMetric();
         FTag t = new FTag();
@@ -612,7 +756,8 @@ public class SystemInformation {
     return TableDataFactory.forColumns(Set.of(), Map.of(), timestamp.get(), cols);
   }
 
-  public void processResponse(final ServerId server, final MetricResponse response) {
+  public void processResponse(final ServerId server, final MetricResponse response,
+      final UpdateTasks callback) {
     problemHosts.remove(server);
     metricProblemHosts.remove(server);
     retainedProblemHosts.remove(server);
@@ -620,6 +765,7 @@ public class SystemInformation {
     resourceGroups.add(response.getResourceGroup());
     deployment.computeIfAbsent(server.getResourceGroup(), g -> new ConcurrentHashMap<>())
         .computeIfAbsent(server.getType(), t -> new ProcessSummary()).addResponded(server);
+    captureRecoveriesInProgress(server, response);
     switch (response.serverType) {
       case COMPACTOR:
         compactors
@@ -634,6 +780,32 @@ public class SystemInformation {
         break;
       case MANAGER:
         managers.add(server);
+        FMetric flatbuffer = new FMetric();
+        for (ByteBuffer binary : response.getMetrics()) {
+          flatbuffer = FMetric.getRootAsFMetric(binary, flatbuffer);
+          final String metricName = flatbuffer.name();
+          if (metricName.equals(Metric.MANAGER_ROOT_TGW_RECOVERY.getName())) {
+            boolean recovering = getMetricValue(flatbuffer).longValue() > 0;
+            this.recoveries.getOverview().setRootTabletRecovering(recovering);
+            if (recovering) {
+              addMessage(Critical, Table, "The root table requires recovery");
+            }
+          } else if (metricName.equals(Metric.MANAGER_META_TGW_RECOVERY.getName())) {
+            long tablets = getMetricValue(flatbuffer).longValue();
+            this.recoveries.getOverview().setMetadataTabletsRecovering(tablets);
+            if (tablets > 0) {
+              addMessage(Critical, Table,
+                  "At least " + tablets + " metadata table tablets require recovery");
+            }
+          } else if (metricName.equals(Metric.MANAGER_USER_TGW_RECOVERY.getName())) {
+            long tablets = getMetricValue(flatbuffer).longValue();
+            this.recoveries.getOverview().setUserTabletsRecovering(tablets);
+            if (tablets > 0) {
+              addMessage(High, Table,
+                  "At least " + tablets + " user table tablets require recovery");
+            }
+          }
+        }
         break;
       case SCAN_SERVER:
         sservers.computeIfAbsent(response.getResourceGroup(), (rg) -> ConcurrentHashMap.newKeySet())
@@ -676,9 +848,42 @@ public class SystemInformation {
     tablets.computeIfAbsent(tableId, (t) -> Collections.synchronizedList(new ArrayList<>()))
         .add(sti);
     tables.computeIfAbsent(tableId, (t) -> new TableSummary(tableName)).addTablet(sti);
-    if (sti.getEstimatedEntries() == 0) {
-      addMessage(Info, Table, "Tablet " + sti.getTabletId().toString() + " (tid: "
-          + sti.getTabletId().getTable() + ") may have zero entries and could be merged.");
+    if (sti.getNumWalLogs() > 0) {
+      String loc = sti.getLocation().orElse("");
+      int idx = loc.indexOf(':');
+      if (loc.length() > 0 && idx > 0) {
+        try {
+          LocationType type = LocationType.valueOf(loc.substring(0, idx));
+          if (type == LocationType.FUTURE) {
+            // When the location is future, then recovery either has not occurred yet, or
+            // is occurring right now. Location is set to current once recovery is complete.
+            this.recoveries.getTabletsNeedingRecovery()
+                .add(new TabletNeedingRecovery(info.getTabletId().getTable().canonical(),
+                    sti.getTabletId().toString(), sti.getTabletDir(),
+                    sti.getLocation().orElse("")));
+          } else if (type == LocationType.CURRENT) {
+            // If the location type is current, but there is no tserver at that location
+            // with a lock, then this tablet needs recovery but has not been assigned a
+            // new location yet.
+            Set<ServiceLockPath> servers =
+                ctx.getServerPaths().getTabletServer(ResourceGroupPredicate.ANY,
+                    AddressSelector.exact(HostAndPort.fromString(loc.substring(idx + 1))), true);
+            if (servers == null || servers.isEmpty()) {
+              this.recoveries.getTabletsNeedingRecovery()
+                  .add(new TabletNeedingRecovery(info.getTabletId().getTable().canonical(),
+                      sti.getTabletId().toString(), sti.getTabletDir(),
+                      sti.getLocation().orElse("")));
+            }
+          }
+        } catch (IllegalArgumentException e) {
+          LOG.error("Unable to determine LocationType from wal location: {}", loc);
+        }
+      } else {
+        // No location, but has logs, still needs recovery
+        this.recoveries.getTabletsNeedingRecovery()
+            .add(new TabletNeedingRecovery(info.getTabletId().getTable().canonical(),
+                sti.getTabletId().toString(), sti.getTabletDir(), sti.getLocation().orElse("")));
+      }
     }
   }
 
@@ -703,13 +908,131 @@ public class SystemInformation {
     configuredCompactionResourceGroups.addAll(groups);
   }
 
-  public void finish() {
-    // Update the deployment not-responded numbers based
-    // on metric fetch failures for this refresh.
-    metricProblemHosts.forEach(serverId -> {
-      deployment.computeIfAbsent(serverId.getResourceGroup(), g -> new ConcurrentHashMap<>())
-          .computeIfAbsent(serverId.getType(), t -> new ProcessSummary()).addNotResponded(serverId);
+  private void computeMessages(final List<UpdateTaskFuture> failures,
+      final List<UpdateTaskFuture> cancelled) {
+
+    if (failures.size() > 0) {
+      addMessage(High, Monitor,
+          "There were " + failures.size() + " failures in the last monitor update cycle."
+              + " Information displayed may be out of date or missing.");
+    }
+
+    if (cancelled.size() > 0) {
+      final long monitorFetchTimeout =
+          ctx.getConfiguration().getTimeInMillis(Property.MONITOR_FETCH_TIMEOUT);
+      String message =
+          "Fetching information for Monitor has taken longer than %1$d ms. (%2$d) tasks were cancelled."
+              + " Information displayed may be out of date or missing. Resolve the issue causing this or increase property `%3$s`.";
+      addMessage(High, Monitor, String.format(message, monitorFetchTimeout, cancelled.size(),
+          Property.MONITOR_FETCH_TIMEOUT.getKey()));
+    }
+
+    Set<ServerId> failedOrCancelledServers = new HashSet<>();
+    Set<TableId> failedOrCancelledTables = new HashSet<>();
+    for (UpdateTaskFuture f : failures) {
+      switch (f.task().getType()) {
+        case COMPACTION:
+          addMessage(Info, Monitor,
+              "The task to get information about currently running compactions failed");
+          break;
+        case COMPACTION_RGS:
+          addMessage(Info, Monitor,
+              "The task to get information about configured compaction resource groups failed");
+          break;
+        case METRIC:
+          ServerId s = ((MetricFetcher) f.task()).getResource();
+          failedOrCancelledServers.add(s);
+          break;
+        case TABLE:
+          TableId t = ((TableInformationFetcher) f.task()).getResource();
+          failedOrCancelledTables.add(t);
+          break;
+        default:
+          break;
+      }
+    }
+
+    for (UpdateTaskFuture f : cancelled) {
+      switch (f.task().getType()) {
+        case COMPACTION:
+          addMessage(Info, Monitor,
+              "The task to get information about currently running compactions was cancelled");
+          break;
+        case COMPACTION_RGS:
+          addMessage(Info, Monitor,
+              "The task to get information about configured compaction resource groups was cancelled");
+          break;
+        case METRIC:
+          ServerId s = ((MetricFetcher) f.task()).getResource();
+          failedOrCancelledServers.add(s);
+          break;
+        case TABLE:
+          TableId t = ((TableInformationFetcher) f.task()).getResource();
+          failedOrCancelledTables.add(t);
+          break;
+        default:
+          break;
+      }
+    }
+
+    if (failedOrCancelledServers.size() > 0) {
+      addMessage(High, Monitor, failedOrCancelledServers.size()
+          + " tasks to get information from servers were failed or cancelled.");
+      addMessage(Info, Monitor,
+          "The Monitor is not displaying updated information for the following servers: "
+              + failedOrCancelledServers);
+    }
+
+    if (failedOrCancelledTables.size() > 0) {
+      addMessage(High, Monitor, failedOrCancelledTables.size()
+          + " tasks to get information for tables were failed or cancelled.");
+      addMessage(Info, Monitor,
+          "The Monitor is not displaying updated information for the following tables: "
+              + failedOrCancelledTables);
+    }
+
+    if (managers.isEmpty()) {
+      addMessage(Critical, Resource, "No Managers are running");
+    }
+
+    if (gc.get() == null) {
+      addMessage(Critical, Resource, "Garbage Collector is not running");
+    }
+
+    if (problemHosts.size() > 0) {
+      addMessage(Info, Resource, "Monitor has not received a response from " + problemHosts.size()
+          + " servers recently: " + problemHosts);
+    }
+
+    if (metricProblemHosts.size() > 0) {
+      addMessage(Info, Resource,
+          "Unable to gather information from " + metricProblemHosts.size() + " servers");
+    }
+
+    for (ResourceGroupId rg : ctx.resourceGroupOperations().list()) {
+      if (rg == ResourceGroupId.DEFAULT) {
+        continue;
+      }
+      if (!compactors.containsKey(rg.canonical()) && !sservers.containsKey(rg.canonical())
+          && !tservers.containsKey(rg.canonical())) {
+        addMessage(Info, Configuration, "Resource Group " + rg
+            + " exists, but no resources assigned. Consider removing the resource group with command `accumulo inst init --remove-resource-groups`");
+      }
+    }
+
+    tablets.forEach((tid, tablets) -> {
+      int empty = 0;
+      for (TabletInformation tablet : tablets) {
+        if (tablet.getEstimatedEntries() == 0) {
+          empty++;
+        }
+      }
+      if (empty > 0) {
+        addMessage(Info, Table,
+            "Table " + tid + " may have " + empty + " tablets that could be merged.");
+      }
     });
+
     for (SystemTables table : SystemTables.values()) {
       TableConfiguration tconf = this.ctx.getTableConfiguration(table.tableId());
       String balancerRG = tconf.get(TableLoadBalancer.TABLE_ASSIGNMENT_GROUP_PROPERTY);
@@ -721,6 +1044,31 @@ public class SystemInformation {
       }
     }
 
+    FMetric flatbuffer = new FMetric();
+    long serversWithZombieScans = 0;
+    for (Entry<ServerId,MetricResponse> e : allMetrics.asMap().entrySet()) {
+      ServerId sid = e.getKey();
+      MetricResponse mr = e.getValue();
+      if (mr != null) {
+        List<ByteBuffer> metrics = mr.metrics;
+        if (sid.getType() == ServerId.Type.SCAN_SERVER
+            || sid.getType() == ServerId.Type.TABLET_SERVER) {
+          for (ByteBuffer binary : metrics) {
+            flatbuffer = FMetric.getRootAsFMetric(binary, flatbuffer);
+            if (flatbuffer.name().equals(Metric.SCAN_ZOMBIE_THREADS.getName())) {
+              if (getMetricValue(flatbuffer).longValue() > 0) {
+                serversWithZombieScans++;
+              }
+            }
+          }
+        }
+      }
+    }
+    if (serversWithZombieScans > 0) {
+      addMessage(High, Resource,
+          "There are " + serversWithZombieScans + " servers with zombie scan threads");
+    }
+
     for (String rg : getResourceGroups()) {
       Set<ServerId> rgCompactors = getCompactorResourceGroupServers(rg);
       List<FMetric> metrics = queueMetrics.get(rg);
@@ -749,7 +1097,7 @@ public class SystemInformation {
             if (idleMetric.isPresent()) {
               var metric = idleMetric.orElseThrow().getValue();
               if (metric.max() == 1.0D) {
-                addMessage(High, Configuration,
+                addMessage(High, Resource,
                     "Compactor group " + rg + " has queued jobs and idle compactors.");
               }
             }
@@ -766,6 +1114,19 @@ public class SystemInformation {
       }
     }
 
+  }
+
+  public void finish(final List<UpdateTaskFuture> failures,
+      final List<UpdateTaskFuture> cancelled) {
+    // Update the deployment not-responded numbers based
+    // on metric fetch failures for this refresh.
+    metricProblemHosts.forEach(serverId -> {
+      deployment.computeIfAbsent(serverId.getResourceGroup(), g -> new ConcurrentHashMap<>())
+          .computeIfAbsent(serverId.getType(), t -> new ProcessSummary()).addNotResponded(serverId);
+    });
+
+    computeMessages(failures, cancelled);
+
     timestamp.set(System.currentTimeMillis());
     componentStatuses.clear();
     for (final ServerId.Type type : ServerId.Type.values()) {
@@ -979,6 +1340,10 @@ public class SystemInformation {
     return this.tablets.get(tableId);
   }
 
+  public RecoveryInformation getRecoveryInformation() {
+    return this.recoveries;
+  }
+
   public DeploymentOverview getDeploymentView() {
     return this.deploymentOverview;
   }
diff --git a/server/monitor/src/main/java/org/apache/accumulo/monitor/view/WebViews.java b/server/monitor/src/main/java/org/apache/accumulo/monitor/view/WebViews.java
index bc09781c95..f373ea2463 100644
--- a/server/monitor/src/main/java/org/apache/accumulo/monitor/view/WebViews.java
+++ b/server/monitor/src/main/java/org/apache/accumulo/monitor/view/WebViews.java
@@ -223,6 +223,24 @@ public class WebViews {
     return model;
   }
 
+  /**
+   * Returns the recovery template
+   *
+   * @return Recovery model
+   */
+  @GET
+  @Path("recovery")
+  @Template(name = "/default.ftl")
+  public Map<String,Object> getRecoveryInformation() {
+
+    Map<String,Object> model = getModel();
+    model.put("title", "Tablet Recoveries");
+    model.put("template", "recovery.ftl");
+    model.put("js", "recovery.js");
+
+    return model;
+  }
+
   /**
    * Returns the scans template
    *
diff --git a/server/monitor/src/main/resources/org/apache/accumulo/monitor/resources/js/functions.js b/server/monitor/src/main/resources/org/apache/accumulo/monitor/resources/js/functions.js
index 38ef5981e7..093d2d1d75 100644
--- a/server/monitor/src/main/resources/org/apache/accumulo/monitor/resources/js/functions.js
+++ b/server/monitor/src/main/resources/org/apache/accumulo/monitor/resources/js/functions.js
@@ -40,6 +40,7 @@ const RUNNING_COMPACTIONS_BY_GROUP = 'runningCompactionsByGroup';
 const AUTO_REFRESH_KEY = 'auto-refresh';
 const MESSAGE_CATEGORIES = 'messageCategories';
 const MESSAGES = 'messages';
+const RECOVERY = 'recovery';
 
 // Override Length Menu options for dataTables
 if ($.fn && $.fn.dataTable) {
@@ -489,13 +490,6 @@ function getServerStats() {
   return getJSONForTable(contextPath + 'rest/tservers/serverStats', 'serverStats');
 }
 
-/**
- * REST GET call for the recovery list, stores it on a sessionStorage variable
- */
-function getRecoveryList() {
-  return getJSONForTable(contextPath + 'rest/tservers/recovery', 'recoveryList');
-}
-
 /**
  * REST GET call for the participating tablet servers,
  * stores it on a sessionStorage variable
@@ -621,6 +615,14 @@ function getMetrics() {
   return getJSONForTable(REST_V2_PREFIX + '/metrics', 'metrics');
 }
 
+/**
+ * REST GET call for /recovery,
+ * stores it on a sessionStorage variable
+ */
+function getRecoveryInformation() {
+  return getJSONForTable(REST_V2_PREFIX + '/recovery', RECOVERY);
+}
+
 /**
  * REST GET call for /status,
  * stores it on a sessionStorage variable
diff --git a/server/monitor/src/main/resources/org/apache/accumulo/monitor/resources/js/manager.js b/server/monitor/src/main/resources/org/apache/accumulo/monitor/resources/js/manager.js
index 569cc3c59e..b7a133b3f3 100644
--- a/server/monitor/src/main/resources/org/apache/accumulo/monitor/resources/js/manager.js
+++ b/server/monitor/src/main/resources/org/apache/accumulo/monitor/resources/js/manager.js
@@ -93,114 +93,3 @@ $(function () {
 
   refresh();
 });
-
-
-
-
-
-// TODO: 6106 - left code commented for the recovery list table to be re-added
-
-/*
-"use strict";
-
-var managerStatusTable, recoveryListTable, managerStatus;
-
-
-
-*/
-/**
- * Populates tables with the new information
- */
-/*
-function refreshManagerTables() {
-  getStatus().then(function () {
-    managerStatus = JSON.parse(sessionStorage.status).managerStatus;
-    refreshManagerBanners();
-    if (managerStatusTable === undefined && managerStatus !== 'ERROR') {
-      // Can happen if the manager is dead on first loading the page, but 
later comes back online
-      // while using auto-refresh
-      createManagerTable();
-    } else if (managerStatus !== 'ERROR') {
-      ajaxReloadTable(managerStatusTable);
-    }
-    ajaxReloadTable(recoveryListTable);
-  });
-}
-*/
-/*
- * The tables.ftl refresh function will do this functionality.
- * If tables are removed from Manager, uncomment this function.
- */
-/**
- * Used to redraw the page
- */
-/*function refresh() {
-  refreshManager();
-}*/
-
-/**
- * Creates initial tables
- */
-/*
-$(function () {
-
-  getStatus().then(function () {
-    managerStatus = JSON.parse(sessionStorage.status).managerStatus;
-    if (managerStatus !== 'ERROR') {
-      createManagerTable();
-    }
-
-    // Generates the recovery table
-    recoveryListTable = $('#recoveryList').DataTable({
-      "ajax": {
-        "url": contextPath + 'rest/tservers/recovery',
-        "dataSrc": function (data) {
-          data = data.recoveryList;
-          if (data.length === 0) {
-            console.info('Recovery list is empty, hiding recovery table');
-            $('#recoveryList_wrapper').hide();
-          } else {
-            $('#recoveryList_wrapper').show();
-          }
-          return data;
-        }
-      },
-      "columnDefs": [{
-          "targets": "duration",
-          "render": function (data, type) {
-            if (type === 'display') {
-              data = timeDuration(parseInt(data, 10));
-            }
-            return data;
-          }
-        },
-        {
-          "targets": "percent",
-          "render": function (data, type) {
-            if (type === 'display') {
-              data = (data * 100).toFixed(2) + '%';
-            }
-            return data;
-          }
-        }
-      ],
-      "stateSave": true,
-      "columns": [{
-          "data": "server"
-        },
-        {
-          "data": "log"
-        },
-        {
-          "data": "time"
-        },
-        {
-          "data": "progress"
-        }
-      ]
-    });
-
-    refreshManagerTables();
-  });
-});
-*/
diff --git 
a/server/monitor/src/main/resources/org/apache/accumulo/monitor/resources/js/recovery.js
 
b/server/monitor/src/main/resources/org/apache/accumulo/monitor/resources/js/recovery.js
new file mode 100644
index 0000000000..c9cf218609
--- /dev/null
+++ 
b/server/monitor/src/main/resources/org/apache/accumulo/monitor/resources/js/recovery.js
@@ -0,0 +1,252 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   https://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+"use strict";
+
+const overviewTableElement = '#recovery-overview';
+const overviewTableDivElement = '#recovery-overview_wrapper';
+const tabletRecoveryTableElement = '#tablets-needing-recovery';
+const tabletRecoveryDivElement = '#tablets-needing-recovery_wrapper';
+const sortingServersTableElement = '#servers-sorting';
+const sortingServersDivElement = '#servers-sorting_wrapper';
+const replayingServersTableElement = '#servers-replaying';
+const replayingServersDivElement = '#servers-replaying_wrapper';
+
+
+var overviewDataTable;
+var tabletDataTable;
+var sortingDataTable;
+var replayingDataTable;
+
+function getOverview() {
+  var arr = [];
+  var overview = getStoredView(RECOVERY).overview;
+  arr.push(overview);
+  return arr;
+}
+
+function getTablets() {
+  return getStoredView(RECOVERY).tabletsNeedingRecovery;
+}
+
+function getSorting() {
+  return getStoredView(RECOVERY).serversSortingLogs;
+}
+
+function getReplaying() {
+  return getStoredView(RECOVERY).serversRecoveringTablets;
+}
+
+function showOrHide(divElement, dataArray) {
+  if (dataArray.length == 0) {
+    $(divElement).hide();
+  } else {
+    $(divElement).show();
+  }
+}
+
+function refresh() {
+  $.when(getRecoveryInformation()).then(function () {
+    ajaxReloadTable(overviewDataTable);
+    ajaxReloadTable(tabletDataTable);
+    ajaxReloadTable(sortingDataTable);
+    ajaxReloadTable(replayingDataTable);
+  }).fail(function () {
+    sessionStorage[RECOVERY] = JSON.stringify({
+      overview: {
+        rootTabletRecovering: false,
+        metadataTabletsRecovering: 0,
+        userTabletsRecovering: 0
+      },
+      tabletsNeedingRecovery: [],
+      serversRecoveringTablets: [],
+      serversSortingLogs: []
+    });
+    ajaxReloadTable(overviewDataTable);
+    ajaxReloadTable(tabletDataTable);
+    ajaxReloadTable(sortingDataTable);
+    ajaxReloadTable(replayingDataTable);
+  });
+  showOrHide(overviewTableDivElement, getOverview());
+  showOrHide(tabletRecoveryDivElement, getTablets());
+  showOrHide(sortingServersDivElement, getSorting());
+  showOrHide(replayingServersDivElement, getReplaying());
+}
+
+$(function () {
+  sessionStorage[RECOVERY] = JSON.stringify({
+    overview: {
+      rootTabletRecovering: false,
+      metadataTabletsRecovering: 0,
+      userTabletsRecovering: 0
+    },
+    tabletsNeedingRecovery: [],
+    serversRecoveringTablets: [],
+    serversSortingLogs: []
+  });
+
+  overviewDataTable = $(overviewTableElement).DataTable({
+    "ajax": function (data, callback) {
+      callback({
+        data: getOverview()
+      });
+    },
+    "info": false,
+    "lengthChange": false,
+    "paging": false,
+    "searching": false,
+    "stateSave": true,
+    "colReorder": true,
+    "columnDefs": [{
+      targets: '_all',
+      defaultContent: '-'
+    }],
+    "columns": [{
+        "data": "rootTabletRecovering"
+      },
+      {
+        "data": "metadataTabletsRecovering"
+      },
+      {
+        "data": "userTabletsRecovering"
+      }
+    ]
+  });
+
+  tabletDataTable = $(tabletRecoveryTableElement).DataTable({
+    "ajax": function (data, callback) {
+      callback({
+        data: getTablets()
+      });
+    },
+    "stateSave": true,
+    "colReorder": true,
+    "columnDefs": [{
+      targets: '_all',
+      defaultContent: '-'
+    }],
+    "columns": [{
+        "data": "tableId"
+      },
+      {
+        "data": "tabletId"
+      },
+      {
+        "data": "tabletDir"
+      },
+      {
+        "data": "location"
+      }
+    ]
+  });
+
+  sortingDataTable = $(sortingServersTableElement).DataTable({
+    "ajax": function (data, callback) {
+      callback({
+        data: getSorting()
+      });
+    },
+    "stateSave": true,
+    "colReorder": true,
+    "columnDefs": [{
+      targets: '_all',
+      defaultContent: '-'
+    }],
+    "columns": [{
+        "data": "server"
+      },
+      {
+        "data": "resourceGroup"
+      },
+      {
+        "data": "type"
+      },
+      {
+        "data": "inProgress"
+      },
+      {
+        "data": "avgProgress",
+        "type": "html",
+        "render": function (data, type, row, meta) {
+          if (type === 'display') {
+            if (row.avgProgress < 0) {
+              data = '--';
+            } else {
+              var p = Math.round(Number(row.avgProgress * 100));
+              console.log("WAL sort progress = %" + p);
+              data = '<div class="progress"><div class="progress-bar" 
role="progressbar" style="min-width: 2em; width:' +
+                p + '%;">' + p + '%</div></div>';
+            }
+          }
+          return data;
+        }
+      },
+      {
+        "data": "longestDuration",
+        "render": function (data, type) {
+          if (type === 'display') {
+            if (data === null || data === undefined) {
+              return '&mdash;';
+            }
+            data = timeDuration(data);
+          }
+          return data;
+        }
+      }
+    ]
+  });
+
+  replayingDataTable = $(replayingServersTableElement).DataTable({
+    "ajax": function (data, callback) {
+      callback({
+        data: getReplaying()
+      });
+    },
+    "stateSave": true,
+    "colReorder": true,
+    "columnDefs": [{
+      targets: '_all',
+      defaultContent: '-'
+    }],
+    "columns": [{
+        "data": "server"
+      },
+      {
+        "data": "resourceGroup"
+      },
+      {
+        "data": "started"
+      },
+      {
+        "data": "completed"
+      },
+      {
+        "data": "failed"
+      },
+      {
+        "data": "inProgress"
+      },
+      {
+        "data": "mutationsReplayed"
+      }
+    ]
+  });
+
+  refresh();
+
+});
diff --git 
a/server/monitor/src/main/resources/org/apache/accumulo/monitor/resources/js/tservers.js
 
b/server/monitor/src/main/resources/org/apache/accumulo/monitor/resources/js/tservers.js
index d877bd9612..1a05aea70d 100644
--- 
a/server/monitor/src/main/resources/org/apache/accumulo/monitor/resources/js/tservers.js
+++ 
b/server/monitor/src/main/resources/org/apache/accumulo/monitor/resources/js/tservers.js
@@ -22,42 +22,6 @@ const htmlBanner = '#tserversStatusBanner'
 const htmlBannerMessage = '#tservers-banner-message'
 const htmlTable = '#tservers'
 var tserversTable;
-var recoveryList = [];
-
-/**
- * Checks if the given server is in the global recoveryList variable
- * 
- * @param {JSON} server json server object
- * @returns true if the server is in the recoveryList, else false
- */
-function serverIsInRecoveryList(server) {
-  return recoveryList.includes(server.hostname);
-}
-
-/**
- * Refreshes the list of recovering tservers and shows/hides the recovery 
caption
- */
-function refreshRecoveryList() {
-  getRecoveryList().then(function () {
-    var sessionStorageRecoveryList, sessionStorageTserversList;
-
-    // get list of recovering servers and online servers from sessionStorage
-    sessionStorageRecoveryList = sessionStorage.recoveryList === undefined ? 
[] : JSON.parse(sessionStorage.recoveryList).recoveryList;
-    sessionStorageTserversList = sessionStorage.tservers === undefined ? [] : 
JSON.parse(sessionStorage.tservers).servers;
-
-    // update global recovery list variable
-    recoveryList = sessionStorageRecoveryList.map(function (entry) {
-      return entry.server;
-    });
-
-    // show the recovery caption if any online servers are in the recovery list
-    if (sessionStorageTserversList.some(serverIsInRecoveryList)) {
-      $('#recovery-caption').show();
-    } else {
-      $('#recovery-caption').hide();
-    }
-  });
-}
 
 /**
  * Show a page banner that matches the tablet server status shown in the 
navbar.
@@ -79,16 +43,12 @@ function refreshTServersBanner() {
 
 
 function refresh() {
-  refreshRecoveryList();
   refreshServerInformation(getTserversView, htmlTable, 
TABLET_SERVER_PROCESS_VIEW, htmlBanner,
     htmlBannerMessage);
   refreshTServersBanner();
 }
 
 $(function () {
-
-  refreshRecoveryList();
-
   sessionStorage[TABLET_SERVER_PROCESS_VIEW] = JSON.stringify({
     data: [],
     columns: []
diff --git 
a/server/monitor/src/main/resources/org/apache/accumulo/monitor/templates/manager.ftl
 
b/server/monitor/src/main/resources/org/apache/accumulo/monitor/templates/manager.ftl
index ea900b3fcb..503ade72d5 100644
--- 
a/server/monitor/src/main/resources/org/apache/accumulo/monitor/templates/manager.ftl
+++ 
b/server/monitor/src/main/resources/org/apache/accumulo/monitor/templates/manager.ftl
@@ -37,21 +37,3 @@
         <#include "table_loading.ftl" >
     </table>
     <br />
-    <!--
-    <table id="recoveryList" class="table caption-top table-bordered 
table-striped table-condensed">
-        <caption><span class="table-caption">Log&nbsp;Recovery</span><br />
-            <span class="table-subcaption">Some tablets were unloaded in an 
unsafe manner. Write-ahead logs are being
-                recovered.</span><br />
-        </caption>
-        <thead>
-            <tr>
-                <th>Server</th>
-                <th>Log</th>
-                <th class="duration">Time</th>
-                <th class="percent">Progress</th>
-            </tr>
-        </thead>
-        <tbody></tbody>
-    </table>
-    <br />
-    -->
\ No newline at end of file
diff --git 
a/server/monitor/src/main/resources/org/apache/accumulo/monitor/templates/navbar.ftl
 
b/server/monitor/src/main/resources/org/apache/accumulo/monitor/templates/navbar.ftl
index d29518c380..fc188ca618 100644
--- 
a/server/monitor/src/main/resources/org/apache/accumulo/monitor/templates/navbar.ftl
+++ 
b/server/monitor/src/main/resources/org/apache/accumulo/monitor/templates/navbar.ftl
@@ -58,6 +58,7 @@
                 <li><a class="link-body-emphasis dropdown-item" 
href="coordinator">Compaction Overview</a></li>
                 <li><a class="link-body-emphasis dropdown-item" 
href="ec">Compaction Details</a></li>
                 <li><a class="link-body-emphasis dropdown-item" 
href="scans">Scans</a></li>
+                <li><a class="link-body-emphasis dropdown-item" 
href="recovery">Tablet Recoveries</a></li>
               </ul>
             </li>
             <li>
diff --git 
a/server/monitor/src/main/resources/org/apache/accumulo/monitor/templates/recovery.ftl
 
b/server/monitor/src/main/resources/org/apache/accumulo/monitor/templates/recovery.ftl
new file mode 100644
index 0000000000..2a9e36b983
--- /dev/null
+++ 
b/server/monitor/src/main/resources/org/apache/accumulo/monitor/templates/recovery.ftl
@@ -0,0 +1,96 @@
+<#--
+
+    Licensed to the Apache Software Foundation (ASF) under one
+    or more contributor license agreements.  See the NOTICE file
+    distributed with this work for additional information
+    regarding copyright ownership.  The ASF licenses this file
+    to you under the Apache License, Version 2.0 (the
+    "License"); you may not use this file except in compliance
+    with the License.  You may obtain a copy of the License at
+
+      https://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing,
+    software distributed under the License is distributed on an
+    "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+    KIND, either express or implied.  See the License for the
+    specific language governing permissions and limitations
+    under the License.
+
+-->
+      <div>
+        <table id="recovery-overview" class="table caption-top table-bordered 
table-striped table-condensed">
+          <caption><span class="table-caption">Table Recovery 
Overview</span><br />
+            <span>The Manager is reporting the following tablets need log 
sorting for recovery</span><br />
+          </caption>
+          <thead>
+            <tr>
+              <th>Root Table</th>
+              <th>Metadata Table Tablets</th>
+              <th>User Table Tablets</th>
+            </tr>
+          </thead>
+          <tbody></tbody>
+        </table>
+        <br />
+      </div>
+      <div>
+        <table id="tablets-needing-recovery" class="table caption-top 
table-bordered table-striped table-condensed">
+          <caption><span class="table-caption">Tablets Requiring 
Recovery</span><br />
+            <span>The system tables are reporting the following tablets need 
recovery (log sorting + mutations replayed)</span><br />
+          </caption>
+          <thead>
+            <tr>
+              <th>Table Id</th>
+              <th>Tablet Id</th>
+              <th>Tablet Directory</th>
+              <th>Location</th>
+            </tr>
+          </thead>
+          <tbody></tbody>
+        </table>
+        <br />
+      </div>
+      <div>
+        <table id="servers-sorting" class="table caption-top table-bordered 
table-striped table-condensed">
+          <caption><span class="table-caption">Servers Sorting WALs</span><br 
/>
+            <span>The following servers have reported WAL sort activity.<br />
+            Compactors will sort WAL files when not performing a 
compaction.<br />
+            Scan Servers and Tablet Servers will sort WAL files
+            in accordance with their properties 
(`sserver.wal.sort.concurrent.max` and 
`tserver.wal.sort.concurrent.max`)</span><br />
+          </caption>          
+          <thead>
+            <tr>
+              <th>Server</th>
+              <th>Resource Group</th>
+              <th>Server Type</th>
+              <th>WAL Sorts In Progress</th>
+              <th>WAL Sorts Avg Progress</th>
+              <th>WAL Sorts Longest Duration</th>
+            </tr>
+          </thead>
+          <tbody></tbody>
+        </table>
+        <br />
+      </div>
+      <div>
+        <table id="servers-replaying" class="table caption-top table-bordered 
table-striped table-condensed">
+          <caption>
+            <span class="table-caption">Tablet Servers Recovering 
Tablets</span><br />
+            <span>The following Tablet Servers have reported Tablet recovery 
activity.</span><br />
+          </caption>          
+          <thead>
+            <tr>
+              <th>Server</th>
+              <th>Resource Group</th>
+              <th>Recoveries Started</th>
+              <th>Recoveries Completed</th>
+              <th>Recoveries Failed</th>
+              <th>Recoveries In Progress</th>
+              <th>Mutations Replayed</th>
+            </tr>
+          </thead>
+          <tbody></tbody>
+        </table>
+      </div>
+      
\ No newline at end of file
diff --git 
a/server/monitor/src/main/resources/org/apache/accumulo/monitor/templates/tservers.ftl
 
b/server/monitor/src/main/resources/org/apache/accumulo/monitor/templates/tservers.ftl
index 06cfd0009e..2973391603 100644
--- 
a/server/monitor/src/main/resources/org/apache/accumulo/monitor/templates/tservers.ftl
+++ 
b/server/monitor/src/main/resources/org/apache/accumulo/monitor/templates/tservers.ftl
@@ -26,7 +26,6 @@
     </div>    
     <div class="row">
       <div class="col-xs-12">
-        <span id="recovery-caption" style="background-color: gold; display: 
none;">Highlighted rows correspond to tservers in recovery mode.</span>
         <table id="tservers" class="table caption-top table-bordered 
table-striped table-condensed">
           <caption><span class="table-caption">Tablet Servers</span><br />
             <span class="table-subcaption">The following Tablet Servers 
reported status.</span><br />
diff --git 
a/server/tserver/src/main/java/org/apache/accumulo/tserver/TabletServer.java 
b/server/tserver/src/main/java/org/apache/accumulo/tserver/TabletServer.java
index 5b7784facf..116876fe36 100644
--- a/server/tserver/src/main/java/org/apache/accumulo/tserver/TabletServer.java
+++ b/server/tserver/src/main/java/org/apache/accumulo/tserver/TabletServer.java
@@ -135,6 +135,7 @@ import org.apache.accumulo.tserver.log.TabletServerLogger;
 import org.apache.accumulo.tserver.managermessage.ManagerMessage;
 import org.apache.accumulo.tserver.metrics.TabletServerMetrics;
 import org.apache.accumulo.tserver.metrics.TabletServerMinCMetrics;
+import org.apache.accumulo.tserver.metrics.TabletServerRecoveryMetrics;
 import org.apache.accumulo.tserver.metrics.TabletServerScanMetrics;
 import org.apache.accumulo.tserver.metrics.TabletServerUpdateMetrics;
 import org.apache.accumulo.tserver.scan.ScanRunState;
@@ -167,6 +168,7 @@ public class TabletServer extends AbstractServer implements 
TabletHostingServer
   TabletServerMinCMetrics mincMetrics;
   PausedCompactionMetrics pausedMetrics;
   BlockCacheMetrics blockCacheMetrics;
+  TabletServerRecoveryMetrics recoveryMetrics;
 
   @Override
   public TabletServerScanMetrics getScanMetrics() {
@@ -182,6 +184,10 @@ public class TabletServer extends AbstractServer 
implements TabletHostingServer
     return pausedMetrics;
   }
 
+  public TabletServerRecoveryMetrics getTabletRecoveryMetrics() {
+    return recoveryMetrics;
+  }
+
   private final LogSorter logSorter;
   final TabletStatsKeeper statsKeeper;
   private final AtomicInteger logIdGenerator = new AtomicInteger();
@@ -556,9 +562,10 @@ public class TabletServer extends AbstractServer 
implements TabletHostingServer
     pausedMetrics = new PausedCompactionMetrics();
     blockCacheMetrics = new 
BlockCacheMetrics(this.resourceManager.getIndexCache(),
         this.resourceManager.getDataCache(), 
this.resourceManager.getSummaryCache());
+    recoveryMetrics = new TabletServerRecoveryMetrics();
 
     metricsInfo.addMetricsProducers(this, metrics, updateMetrics, scanMetrics, 
mincMetrics,
-        pausedMetrics, blockCacheMetrics, logSorter);
+        pausedMetrics, blockCacheMetrics, logSorter, recoveryMetrics);
     metricsInfo.init(MetricsInfo.serviceTags(context.getInstanceName(), 
getApplicationName(),
         getAdvertiseAddress(), getResourceGroup()));
 
diff --git 
a/server/tserver/src/main/java/org/apache/accumulo/tserver/log/LogSorter.java 
b/server/tserver/src/main/java/org/apache/accumulo/tserver/log/LogSorter.java
index f885e2b8a8..dc11899e97 100644
--- 
a/server/tserver/src/main/java/org/apache/accumulo/tserver/log/LogSorter.java
+++ 
b/server/tserver/src/main/java/org/apache/accumulo/tserver/log/LogSorter.java
@@ -19,9 +19,9 @@
 package org.apache.accumulo.tserver.log;
 
 import static java.nio.charset.StandardCharsets.UTF_8;
-import static org.apache.accumulo.core.metrics.Metric.RECOVERIES_AVG_PROGRESS;
-import static org.apache.accumulo.core.metrics.Metric.RECOVERIES_IN_PROGRESS;
-import static 
org.apache.accumulo.core.metrics.Metric.RECOVERIES_LONGEST_RUNTIME;
+import static 
org.apache.accumulo.core.metrics.Metric.RECOVERIES_SORTS_AVG_PROGRESS;
+import static 
org.apache.accumulo.core.metrics.Metric.RECOVERIES_SORTS_IN_PROGRESS;
+import static 
org.apache.accumulo.core.metrics.Metric.RECOVERIES_SORTS_LONGEST_RUNTIME;
 import static 
org.apache.accumulo.core.util.threads.ThreadPoolNames.TSERVER_WAL_SORT_CONCURRENT_POOL;
 
 import java.io.DataInputStream;
@@ -382,11 +382,11 @@ public class LogSorter implements MetricsProducer {
 
   @Override
   public void registerMetrics(MeterRegistry registry) {
-    Gauge.builder(RECOVERIES_IN_PROGRESS.getName(), recoveriesInProgress, 
AtomicLong::get)
-        
.description(RECOVERIES_IN_PROGRESS.getDescription()).register(registry);
-    Gauge.builder(RECOVERIES_LONGEST_RUNTIME.getName(), recoveryRuntime, 
AtomicLong::get)
-        
.description(RECOVERIES_LONGEST_RUNTIME.getDescription()).register(registry);
-    Gauge.builder(RECOVERIES_AVG_PROGRESS.getName(), recoveryAvgProgress, 
AtomicDouble::get)
-        
.description(RECOVERIES_AVG_PROGRESS.getDescription()).register(registry);
+    Gauge.builder(RECOVERIES_SORTS_IN_PROGRESS.getName(), 
recoveriesInProgress, AtomicLong::get)
+        
.description(RECOVERIES_SORTS_IN_PROGRESS.getDescription()).register(registry);
+    Gauge.builder(RECOVERIES_SORTS_LONGEST_RUNTIME.getName(), recoveryRuntime, 
AtomicLong::get)
+        
.description(RECOVERIES_SORTS_LONGEST_RUNTIME.getDescription()).register(registry);
+    Gauge.builder(RECOVERIES_SORTS_AVG_PROGRESS.getName(), 
recoveryAvgProgress, AtomicDouble::get)
+        
.description(RECOVERIES_SORTS_AVG_PROGRESS.getDescription()).register(registry);
   }
 }
diff --git 
a/server/tserver/src/main/java/org/apache/accumulo/tserver/metrics/TabletServerRecoveryMetrics.java
 
b/server/tserver/src/main/java/org/apache/accumulo/tserver/metrics/TabletServerRecoveryMetrics.java
new file mode 100644
index 0000000000..c5e0705550
--- /dev/null
+++ 
b/server/tserver/src/main/java/org/apache/accumulo/tserver/metrics/TabletServerRecoveryMetrics.java
@@ -0,0 +1,76 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   https://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.accumulo.tserver.metrics;
+
+import static 
org.apache.accumulo.core.metrics.Metric.RECOVERIES_TABLETS_COMPLETED;
+import static 
org.apache.accumulo.core.metrics.Metric.RECOVERIES_TABLETS_FAILED;
+import static 
org.apache.accumulo.core.metrics.Metric.RECOVERIES_TABLETS_IN_PROGRESS;
+import static 
org.apache.accumulo.core.metrics.Metric.RECOVERIES_TABLETS_MUTATIONS_REPLAYED;
+import static 
org.apache.accumulo.core.metrics.Metric.RECOVERIES_TABLETS_STARTED;
+
+import java.util.concurrent.atomic.AtomicLong;
+
+import org.apache.accumulo.core.metrics.MetricsProducer;
+
+import io.micrometer.core.instrument.Gauge;
+import io.micrometer.core.instrument.MeterRegistry;
+
+public class TabletServerRecoveryMetrics implements MetricsProducer {
+
+  private final AtomicLong recoveriesStarted = new AtomicLong(0);
+  private final AtomicLong recoveriesCompleted = new AtomicLong(0);
+  private final AtomicLong recoveriesFailed = new AtomicLong(0);
+  private final AtomicLong concurrentRecoveries = new AtomicLong(0);
+  private final AtomicLong mutationsReplayed = new AtomicLong(0);
+
+  public void recoveryStarted() {
+    recoveriesStarted.incrementAndGet();
+    concurrentRecoveries.incrementAndGet();
+  }
+
+  public void recoveryCompleted() {
+    recoveriesCompleted.incrementAndGet();
+    concurrentRecoveries.decrementAndGet();
+  }
+
+  public void recoveryFailed() {
+    recoveriesFailed.incrementAndGet();
+    concurrentRecoveries.decrementAndGet();
+  }
+
+  public void incrementMutationsReplayed() {
+    mutationsReplayed.incrementAndGet();
+  }
+
+  @Override
+  public void registerMetrics(MeterRegistry registry) {
+    Gauge.builder(RECOVERIES_TABLETS_STARTED.getName(), recoveriesStarted, 
AtomicLong::get)
+        
.description(RECOVERIES_TABLETS_STARTED.getDescription()).register(registry);
+    Gauge.builder(RECOVERIES_TABLETS_COMPLETED.getName(), recoveriesCompleted, 
AtomicLong::get)
+        
.description(RECOVERIES_TABLETS_COMPLETED.getDescription()).register(registry);
+    Gauge.builder(RECOVERIES_TABLETS_FAILED.getName(), recoveriesFailed, 
AtomicLong::get)
+        
.description(RECOVERIES_TABLETS_FAILED.getDescription()).register(registry);
+    Gauge.builder(RECOVERIES_TABLETS_IN_PROGRESS.getName(), 
concurrentRecoveries, AtomicLong::get)
+        
.description(RECOVERIES_TABLETS_IN_PROGRESS.getDescription()).register(registry);
+    Gauge
+        .builder(RECOVERIES_TABLETS_MUTATIONS_REPLAYED.getName(), 
mutationsReplayed,
+            AtomicLong::get)
+        
.description(RECOVERIES_TABLETS_MUTATIONS_REPLAYED.getDescription()).register(registry);
+  }
+}
diff --git 
a/server/tserver/src/main/java/org/apache/accumulo/tserver/tablet/Tablet.java 
b/server/tserver/src/main/java/org/apache/accumulo/tserver/tablet/Tablet.java
index fa38dceeb8..c80338776b 100644
--- 
a/server/tserver/src/main/java/org/apache/accumulo/tserver/tablet/Tablet.java
+++ 
b/server/tserver/src/main/java/org/apache/accumulo/tserver/tablet/Tablet.java
@@ -104,6 +104,7 @@ import org.apache.accumulo.tserver.TservConstraintEnv;
 import org.apache.accumulo.tserver.constraints.ConstraintChecker;
 import org.apache.accumulo.tserver.log.DfsLogger;
 import org.apache.accumulo.tserver.metrics.TabletServerMinCMetrics;
+import org.apache.accumulo.tserver.metrics.TabletServerRecoveryMetrics;
 import org.apache.accumulo.tserver.metrics.TabletServerScanMetrics;
 import org.apache.accumulo.tserver.scan.ScanParameters;
 import org.apache.hadoop.fs.FileStatus;
@@ -268,6 +269,8 @@ public class Tablet extends TabletBase {
 
     // don't bother examining WALs for recovery if Table is being deleted
     if (!logEntries.isEmpty() && !isBeingDeleted()) {
+      TabletServerRecoveryMetrics recoveryMetrics = 
tabletServer.getTabletRecoveryMetrics();
+      recoveryMetrics.recoveryStarted();
       TabletLogger.recovering(extent, logEntries);
       final AtomicLong entriesUsedOnTablet = new AtomicLong(0);
       // track max time from walog entries without timestamps
@@ -291,6 +294,7 @@ public class Tablet extends TabletBase {
               }
               getTabletMemory().mutate(commitSession, 
Collections.singletonList(m), 1);
               entriesUsedOnTablet.incrementAndGet();
+              recoveryMetrics.incrementMutationsReplayed();
             });
 
         if (maxTime.get() != Long.MIN_VALUE) {
@@ -322,6 +326,7 @@ public class Tablet extends TabletBase {
         }
 
       } catch (IOException | RuntimeException t) {
+        recoveryMetrics.recoveryFailed();
         String msg = "Error recovering tablet " + extent + " from log files";
         if (tableConfiguration.getBoolean(Property.TABLE_FAILURES_IGNORE)) {
           log.warn(msg, t);
@@ -337,6 +342,7 @@ public class Tablet extends TabletBase {
 
       rebuildReferencedLogs();
 
+      recoveryMetrics.recoveryCompleted();
       TabletLogger.recovered(extent, logEntries, entriesUsedOnTablet.get(),
           getTabletMemory().getNumEntries());
     }


Reply via email to