This is an automated email from the ASF dual-hosted git repository.

DomGarguilo pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/accumulo.git


The following commit(s) were added to refs/heads/main by this push:
     new 44030dd2fa Retain recently failed servers in monitor status and server views (#6348)
44030dd2fa is described below

commit 44030dd2fa42ee7741f6846db4cef513247df2d1
Author: Dom G. <[email protected]>
AuthorDate: Thu Apr 30 11:38:40 2026 -0400

    Retain recently failed servers in monitor status and server views (#6348)
    
    * Retain recently failed servers in monitor status and server views
    
    * Fix inconsistent banner on tserver page
---
 .../accumulo/monitor/next/InformationFetcher.java  |  6 +++++
 .../accumulo/monitor/next/SystemInformation.java   | 26 ++++++++++++++++------
 .../accumulo/monitor/resources/js/tservers.js      | 16 +++----------
 .../apache/accumulo/monitor/templates/tservers.ftl | 10 ---------
 4 files changed, 28 insertions(+), 30 deletions(-)

diff --git a/server/monitor/src/main/java/org/apache/accumulo/monitor/next/InformationFetcher.java b/server/monitor/src/main/java/org/apache/accumulo/monitor/next/InformationFetcher.java
index 026828c579..666bfe3296 100644
--- a/server/monitor/src/main/java/org/apache/accumulo/monitor/next/InformationFetcher.java
+++ b/server/monitor/src/main/java/org/apache/accumulo/monitor/next/InformationFetcher.java
@@ -127,12 +127,14 @@ public class InformationFetcher implements 
RemovalListener<ServerId,MetricRespon
             HostAndPort.fromParts(server.getHost(), server.getPort()), ctx);
         try {
           MetricResponse response = 
metricsClient.getMetrics(TraceUtil.traceInfo(), ctx.rpcCreds());
+          retainedProblemServers.invalidate(server);
           summary.processResponse(server, response);
         } finally {
           ThriftUtil.returnClient(metricsClient, ctx);
         }
       } catch (Exception e) {
         LOG.warn("Error trying to get metrics from server: {}", server, e);
+        retainedProblemServers.put(server, Boolean.TRUE);
         summary.processMetricsError(server);
       }
     }
@@ -197,6 +199,7 @@ public class InformationFetcher implements 
RemovalListener<ServerId,MetricRespon
   private final Supplier<Long> connectionCount;
   private final AtomicBoolean newConnectionEvent = new AtomicBoolean(false);
   private final Cache<ServerId,MetricResponse> allMetrics;
+  private final Cache<ServerId,Boolean> retainedProblemServers;
   private final AtomicReference<SystemInformation> summaryRef = new 
AtomicReference<>();
 
   public InformationFetcher(ServerContext ctx, Supplier<Long> connectionCount) 
{
@@ -204,6 +207,8 @@ public class InformationFetcher implements 
RemovalListener<ServerId,MetricRespon
     this.connectionCount = connectionCount;
     this.allMetrics = 
Caffeine.newBuilder().executor(pool).scheduler(Scheduler.systemScheduler())
         
.expireAfterWrite(Duration.ofMinutes(10)).evictionListener(this::onRemoval).build();
+    this.retainedProblemServers = Caffeine.newBuilder().executor(pool)
+        
.scheduler(Scheduler.systemScheduler()).expireAfterWrite(Duration.ofMinutes(10)).build();
   }
 
   public void newConnectionEvent() {
@@ -348,6 +353,7 @@ public class InformationFetcher implements 
RemovalListener<ServerId,MetricRespon
       if (tookToLong) {
         summary.clear();
       } else {
+        
retainedProblemServers.asMap().keySet().forEach(summary::retainProblemServer);
         summary.finish();
 
         LOG.info("Finished fetching metrics from servers");
diff --git a/server/monitor/src/main/java/org/apache/accumulo/monitor/next/SystemInformation.java b/server/monitor/src/main/java/org/apache/accumulo/monitor/next/SystemInformation.java
index 411dcf9d39..855e442348 100644
--- a/server/monitor/src/main/java/org/apache/accumulo/monitor/next/SystemInformation.java
+++ b/server/monitor/src/main/java/org/apache/accumulo/monitor/next/SystemInformation.java
@@ -378,6 +378,7 @@ public class SystemInformation {
   private final Set<String> resourceGroups = ConcurrentHashMap.newKeySet();
   private final Set<ServerId> problemHosts = ConcurrentHashMap.newKeySet();
   private final Set<ServerId> metricProblemHosts = 
ConcurrentHashMap.newKeySet();
+  private final Set<ServerId> retainedProblemHosts = 
ConcurrentHashMap.newKeySet();
   private final Set<ServerId> managers = ConcurrentHashMap.newKeySet();
   private final AtomicReference<ServerId> gc = new AtomicReference<>();
 
@@ -449,6 +450,7 @@ public class SystemInformation {
     resourceGroups.clear();
     problemHosts.clear();
     metricProblemHosts.clear();
+    retainedProblemHosts.clear();
     managers.clear();
     compactors.clear();
     sservers.clear();
@@ -593,6 +595,7 @@ public class SystemInformation {
   public void processResponse(final ServerId server, final MetricResponse 
response) {
     problemHosts.remove(server);
     metricProblemHosts.remove(server);
+    retainedProblemHosts.remove(server);
     allMetrics.put(server, response);
     resourceGroups.add(response.getResourceGroup());
     deployment.computeIfAbsent(server.getResourceGroup(), g -> new 
ConcurrentHashMap<>())
@@ -669,6 +672,13 @@ public class SystemInformation {
     allMetrics.invalidate(server);
   }
 
+  public void retainProblemServer(ServerId server) {
+    problemHosts.add(server);
+    metricProblemHosts.add(server);
+    retainedProblemHosts.add(server);
+    resourceGroups.add(server.getResourceGroup().canonical());
+  }
+
   public void addConfiguredCompactionGroups(Set<String> groups) {
     configuredCompactionResourceGroups.addAll(groups);
   }
@@ -758,33 +768,28 @@ public class SystemInformation {
         .forEach((k, v) -> groupCompactions.add(new CompactionGroupSummary(k, 
v.sum())));
 
     for (final ServerId.Type type : ServerId.Type.values()) {
-      Set<ServerId> servers = new HashSet<>();
+      Set<ServerId> servers = getServers(type);
       switch (type) {
         case COMPACTOR:
-          compactors.values().forEach(servers::addAll);
           cacheServerProcessView(TableDataFactory.TableName.COMPACTORS, 
servers);
           break;
         case GARBAGE_COLLECTOR:
-          servers.add(gc.get());
           cacheServerProcessView(TableDataFactory.TableName.GC_SUMMARY, 
servers);
           cacheServerProcessView(TableDataFactory.TableName.GC_FILES, servers);
           cacheServerProcessView(TableDataFactory.TableName.GC_WALS, servers);
           break;
         case MANAGER:
-          servers.addAll(managers);
           cacheServerProcessView(TableDataFactory.TableName.MANAGERS, servers);
           cacheServerProcessView(TableDataFactory.TableName.MANAGER_FATE, 
servers);
           
cacheServerProcessView(TableDataFactory.TableName.MANAGER_COMPACTIONS, servers);
-          TableData coordinatorQueues = createCompactionQueueSummary(servers);
+          TableData coordinatorQueues = 
createCompactionQueueSummary(getActiveServers(type));
           serverMetricsView.put(TableDataFactory.TableName.COORDINATOR_QUEUES,
               memoize(() -> coordinatorQueues));
           break;
         case SCAN_SERVER:
-          sservers.values().forEach(servers::addAll);
           cacheServerProcessView(TableDataFactory.TableName.SCAN_SERVERS, 
servers);
           break;
         case TABLET_SERVER:
-          tservers.values().forEach(servers::addAll);
           cacheServerProcessView(TableDataFactory.TableName.TABLET_SERVERS, 
servers);
           break;
         case MONITOR:
@@ -848,6 +853,13 @@ public class SystemInformation {
   }
 
   private Set<ServerId> getServers(ServerId.Type type) {
+    Set<ServerId> servers = new HashSet<>(getActiveServers(type));
+    retainedProblemHosts.stream().filter(serverId -> serverId.getType() == 
type)
+        .forEach(servers::add);
+    return servers;
+  }
+
+  private Set<ServerId> getActiveServers(ServerId.Type type) {
     return switch (type) {
       case COMPACTOR -> getAll(compactors);
       case GARBAGE_COLLECTOR -> {
diff --git a/server/monitor/src/main/resources/org/apache/accumulo/monitor/resources/js/tservers.js b/server/monitor/src/main/resources/org/apache/accumulo/monitor/resources/js/tservers.js
index 80259f8f87..d877bd9612 100644
--- a/server/monitor/src/main/resources/org/apache/accumulo/monitor/resources/js/tservers.js
+++ b/server/monitor/src/main/resources/org/apache/accumulo/monitor/resources/js/tservers.js
@@ -67,23 +67,12 @@ function refreshTServersBanner() {
     var statusData = getStoredStatusData();
     if (getComponentStatus(statusData, 'MANAGER') === 'ERROR') {
       $('#tserversManagerBanner').show();
-      $('#tserversWarnBanner').hide();
-      $('#tserversErrorBanner').hide();
+      $(htmlBanner).hide();
       $('#tservers_wrapper').hide();
       $('#recovery-caption').hide();
     } else {
       $('#tserversManagerBanner').hide();
       $('#tservers_wrapper').show();
-      if (getComponentStatus(statusData, 'TABLET_SERVER') === 'ERROR') {
-        $('#tserversWarnBanner').hide();
-        $('#tserversErrorBanner').show();
-      } else if (getComponentStatus(statusData, 'TABLET_SERVER') === 'WARN') {
-        $('#tserversWarnBanner').show();
-        $('#tserversErrorBanner').hide();
-      } else {
-        $('#tserversWarnBanner').hide();
-        $('#tserversErrorBanner').hide();
-      }
     }
   });
 }
@@ -91,9 +80,9 @@ function refreshTServersBanner() {
 
 function refresh() {
   refreshRecoveryList();
-  refreshTServersBanner();
   refreshServerInformation(getTserversView, htmlTable, 
TABLET_SERVER_PROCESS_VIEW, htmlBanner,
     htmlBannerMessage);
+  refreshTServersBanner();
 }
 
 $(function () {
@@ -107,4 +96,5 @@ $(function () {
 
   refreshServerInformation(getTserversView, htmlTable, 
TABLET_SERVER_PROCESS_VIEW, htmlBanner,
     htmlBannerMessage);
+  refreshTServersBanner();
 });
diff --git a/server/monitor/src/main/resources/org/apache/accumulo/monitor/templates/tservers.ftl b/server/monitor/src/main/resources/org/apache/accumulo/monitor/templates/tservers.ftl
index b92d5034fc..c3929efa00 100644
--- a/server/monitor/src/main/resources/org/apache/accumulo/monitor/templates/tservers.ftl
+++ b/server/monitor/src/main/resources/org/apache/accumulo/monitor/templates/tservers.ftl
@@ -21,16 +21,6 @@
     <div id="tserversManagerBanner" style="display: none;">
       <div class="alert alert-danger" role="alert">Manager Not Running</div>
     </div>
-    <div id="tserversWarnBanner" style="display: none;">
-      <div class="alert alert-warning" role="alert">
-        One or more Tablet Servers are unavailable or reported as bad.
-      </div>
-    </div>
-    <div id="tserversErrorBanner" style="display: none;">
-      <div class="alert alert-danger" role="alert">
-        No Tablet Servers are currently responding.
-      </div>
-    </div>
     <div id="tserversStatusBanner" style="display: none;">
       <div id="tservers-banner-message" class="alert" role="alert"></div>
     </div>    

Reply via email to