This is an automated email from the ASF dual-hosted git repository.
DomGarguilo pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/accumulo.git
The following commit(s) were added to refs/heads/main by this push:
new 44030dd2fa Retain recently failed servers in monitor status and server views (#6348)
44030dd2fa is described below
commit 44030dd2fa42ee7741f6846db4cef513247df2d1
Author: Dom G. <[email protected]>
AuthorDate: Thu Apr 30 11:38:40 2026 -0400
Retain recently failed servers in monitor status and server views (#6348)
* Retain recently failed servers in monitor status and server views
* Fix inconsistent banner on tserver page
---
.../accumulo/monitor/next/InformationFetcher.java | 6 +++++
.../accumulo/monitor/next/SystemInformation.java | 26 ++++++++++++++++------
.../accumulo/monitor/resources/js/tservers.js | 16 +++----------
.../apache/accumulo/monitor/templates/tservers.ftl | 10 ---------
4 files changed, 28 insertions(+), 30 deletions(-)
diff --git a/server/monitor/src/main/java/org/apache/accumulo/monitor/next/InformationFetcher.java b/server/monitor/src/main/java/org/apache/accumulo/monitor/next/InformationFetcher.java
index 026828c579..666bfe3296 100644
--- a/server/monitor/src/main/java/org/apache/accumulo/monitor/next/InformationFetcher.java
+++ b/server/monitor/src/main/java/org/apache/accumulo/monitor/next/InformationFetcher.java
@@ -127,12 +127,14 @@ public class InformationFetcher implements
RemovalListener<ServerId,MetricRespon
HostAndPort.fromParts(server.getHost(), server.getPort()), ctx);
try {
MetricResponse response =
metricsClient.getMetrics(TraceUtil.traceInfo(), ctx.rpcCreds());
+ retainedProblemServers.invalidate(server);
summary.processResponse(server, response);
} finally {
ThriftUtil.returnClient(metricsClient, ctx);
}
} catch (Exception e) {
LOG.warn("Error trying to get metrics from server: {}", server, e);
+ retainedProblemServers.put(server, Boolean.TRUE);
summary.processMetricsError(server);
}
}
@@ -197,6 +199,7 @@ public class InformationFetcher implements
RemovalListener<ServerId,MetricRespon
private final Supplier<Long> connectionCount;
private final AtomicBoolean newConnectionEvent = new AtomicBoolean(false);
private final Cache<ServerId,MetricResponse> allMetrics;
+ private final Cache<ServerId,Boolean> retainedProblemServers;
private final AtomicReference<SystemInformation> summaryRef = new
AtomicReference<>();
public InformationFetcher(ServerContext ctx, Supplier<Long> connectionCount)
{
@@ -204,6 +207,8 @@ public class InformationFetcher implements
RemovalListener<ServerId,MetricRespon
this.connectionCount = connectionCount;
this.allMetrics =
Caffeine.newBuilder().executor(pool).scheduler(Scheduler.systemScheduler())
.expireAfterWrite(Duration.ofMinutes(10)).evictionListener(this::onRemoval).build();
+ this.retainedProblemServers = Caffeine.newBuilder().executor(pool)
+
.scheduler(Scheduler.systemScheduler()).expireAfterWrite(Duration.ofMinutes(10)).build();
}
public void newConnectionEvent() {
@@ -348,6 +353,7 @@ public class InformationFetcher implements
RemovalListener<ServerId,MetricRespon
if (tookToLong) {
summary.clear();
} else {
+
retainedProblemServers.asMap().keySet().forEach(summary::retainProblemServer);
summary.finish();
LOG.info("Finished fetching metrics from servers");
diff --git a/server/monitor/src/main/java/org/apache/accumulo/monitor/next/SystemInformation.java b/server/monitor/src/main/java/org/apache/accumulo/monitor/next/SystemInformation.java
index 411dcf9d39..855e442348 100644
--- a/server/monitor/src/main/java/org/apache/accumulo/monitor/next/SystemInformation.java
+++ b/server/monitor/src/main/java/org/apache/accumulo/monitor/next/SystemInformation.java
@@ -378,6 +378,7 @@ public class SystemInformation {
private final Set<String> resourceGroups = ConcurrentHashMap.newKeySet();
private final Set<ServerId> problemHosts = ConcurrentHashMap.newKeySet();
private final Set<ServerId> metricProblemHosts =
ConcurrentHashMap.newKeySet();
+ private final Set<ServerId> retainedProblemHosts =
ConcurrentHashMap.newKeySet();
private final Set<ServerId> managers = ConcurrentHashMap.newKeySet();
private final AtomicReference<ServerId> gc = new AtomicReference<>();
@@ -449,6 +450,7 @@ public class SystemInformation {
resourceGroups.clear();
problemHosts.clear();
metricProblemHosts.clear();
+ retainedProblemHosts.clear();
managers.clear();
compactors.clear();
sservers.clear();
@@ -593,6 +595,7 @@ public class SystemInformation {
public void processResponse(final ServerId server, final MetricResponse
response) {
problemHosts.remove(server);
metricProblemHosts.remove(server);
+ retainedProblemHosts.remove(server);
allMetrics.put(server, response);
resourceGroups.add(response.getResourceGroup());
deployment.computeIfAbsent(server.getResourceGroup(), g -> new
ConcurrentHashMap<>())
@@ -669,6 +672,13 @@ public class SystemInformation {
allMetrics.invalidate(server);
}
+ public void retainProblemServer(ServerId server) {
+ problemHosts.add(server);
+ metricProblemHosts.add(server);
+ retainedProblemHosts.add(server);
+ resourceGroups.add(server.getResourceGroup().canonical());
+ }
+
public void addConfiguredCompactionGroups(Set<String> groups) {
configuredCompactionResourceGroups.addAll(groups);
}
@@ -758,33 +768,28 @@ public class SystemInformation {
.forEach((k, v) -> groupCompactions.add(new CompactionGroupSummary(k,
v.sum())));
for (final ServerId.Type type : ServerId.Type.values()) {
- Set<ServerId> servers = new HashSet<>();
+ Set<ServerId> servers = getServers(type);
switch (type) {
case COMPACTOR:
- compactors.values().forEach(servers::addAll);
cacheServerProcessView(TableDataFactory.TableName.COMPACTORS,
servers);
break;
case GARBAGE_COLLECTOR:
- servers.add(gc.get());
cacheServerProcessView(TableDataFactory.TableName.GC_SUMMARY,
servers);
cacheServerProcessView(TableDataFactory.TableName.GC_FILES, servers);
cacheServerProcessView(TableDataFactory.TableName.GC_WALS, servers);
break;
case MANAGER:
- servers.addAll(managers);
cacheServerProcessView(TableDataFactory.TableName.MANAGERS, servers);
cacheServerProcessView(TableDataFactory.TableName.MANAGER_FATE,
servers);
cacheServerProcessView(TableDataFactory.TableName.MANAGER_COMPACTIONS, servers);
- TableData coordinatorQueues = createCompactionQueueSummary(servers);
+ TableData coordinatorQueues =
createCompactionQueueSummary(getActiveServers(type));
serverMetricsView.put(TableDataFactory.TableName.COORDINATOR_QUEUES,
memoize(() -> coordinatorQueues));
break;
case SCAN_SERVER:
- sservers.values().forEach(servers::addAll);
cacheServerProcessView(TableDataFactory.TableName.SCAN_SERVERS,
servers);
break;
case TABLET_SERVER:
- tservers.values().forEach(servers::addAll);
cacheServerProcessView(TableDataFactory.TableName.TABLET_SERVERS,
servers);
break;
case MONITOR:
@@ -848,6 +853,13 @@ public class SystemInformation {
}
private Set<ServerId> getServers(ServerId.Type type) {
+ Set<ServerId> servers = new HashSet<>(getActiveServers(type));
+ retainedProblemHosts.stream().filter(serverId -> serverId.getType() ==
type)
+ .forEach(servers::add);
+ return servers;
+ }
+
+ private Set<ServerId> getActiveServers(ServerId.Type type) {
return switch (type) {
case COMPACTOR -> getAll(compactors);
case GARBAGE_COLLECTOR -> {
diff --git a/server/monitor/src/main/resources/org/apache/accumulo/monitor/resources/js/tservers.js b/server/monitor/src/main/resources/org/apache/accumulo/monitor/resources/js/tservers.js
index 80259f8f87..d877bd9612 100644
--- a/server/monitor/src/main/resources/org/apache/accumulo/monitor/resources/js/tservers.js
+++ b/server/monitor/src/main/resources/org/apache/accumulo/monitor/resources/js/tservers.js
@@ -67,23 +67,12 @@ function refreshTServersBanner() {
var statusData = getStoredStatusData();
if (getComponentStatus(statusData, 'MANAGER') === 'ERROR') {
$('#tserversManagerBanner').show();
- $('#tserversWarnBanner').hide();
- $('#tserversErrorBanner').hide();
+ $(htmlBanner).hide();
$('#tservers_wrapper').hide();
$('#recovery-caption').hide();
} else {
$('#tserversManagerBanner').hide();
$('#tservers_wrapper').show();
- if (getComponentStatus(statusData, 'TABLET_SERVER') === 'ERROR') {
- $('#tserversWarnBanner').hide();
- $('#tserversErrorBanner').show();
- } else if (getComponentStatus(statusData, 'TABLET_SERVER') === 'WARN') {
- $('#tserversWarnBanner').show();
- $('#tserversErrorBanner').hide();
- } else {
- $('#tserversWarnBanner').hide();
- $('#tserversErrorBanner').hide();
- }
}
});
}
@@ -91,9 +80,9 @@ function refreshTServersBanner() {
function refresh() {
refreshRecoveryList();
- refreshTServersBanner();
refreshServerInformation(getTserversView, htmlTable,
TABLET_SERVER_PROCESS_VIEW, htmlBanner,
htmlBannerMessage);
+ refreshTServersBanner();
}
$(function () {
@@ -107,4 +96,5 @@ $(function () {
refreshServerInformation(getTserversView, htmlTable,
TABLET_SERVER_PROCESS_VIEW, htmlBanner,
htmlBannerMessage);
+ refreshTServersBanner();
});
diff --git a/server/monitor/src/main/resources/org/apache/accumulo/monitor/templates/tservers.ftl b/server/monitor/src/main/resources/org/apache/accumulo/monitor/templates/tservers.ftl
index b92d5034fc..c3929efa00 100644
--- a/server/monitor/src/main/resources/org/apache/accumulo/monitor/templates/tservers.ftl
+++ b/server/monitor/src/main/resources/org/apache/accumulo/monitor/templates/tservers.ftl
@@ -21,16 +21,6 @@
<div id="tserversManagerBanner" style="display: none;">
<div class="alert alert-danger" role="alert">Manager Not Running</div>
</div>
- <div id="tserversWarnBanner" style="display: none;">
- <div class="alert alert-warning" role="alert">
- One or more Tablet Servers are unavailable or reported as bad.
- </div>
- </div>
- <div id="tserversErrorBanner" style="display: none;">
- <div class="alert alert-danger" role="alert">
- No Tablet Servers are currently responding.
- </div>
- </div>
<div id="tserversStatusBanner" style="display: none;">
<div id="tservers-banner-message" class="alert" role="alert"></div>
</div>