This is an automated email from the ASF dual-hosted git repository. dlmarion pushed a commit to branch 2.1 in repository https://gitbox.apache.org/repos/asf/accumulo.git
The following commit(s) were added to refs/heads/2.1 by this push: new 041e4c1612 Changed Halt to print msg to stderr (#5574) 041e4c1612 is described below commit 041e4c1612fe12df775d17f5af2839df70f4b584 Author: Dave Marion <dlmar...@apache.org> AuthorDate: Wed May 28 11:54:33 2025 -0400 Changed Halt to print msg to stderr (#5574) Logging that happens in proximity to a Runtime.halt may not make it to the log before the process shuts down if an asynchronous logging implementation is used. This change modifies the Halt methods to print any log message to stderr and flushing before logging to the logging implementation. Closes #5545 --- .../core/fate/zookeeper/ServiceLockSupport.java | 23 +++++------- .../java/org/apache/accumulo/core/util/Halt.java | 41 +++++++++++++--------- .../org/apache/accumulo/server/AbstractServer.java | 2 +- .../accumulo/server/GarbageCollectionLogger.java | 2 +- .../accumulo/server/manager/LiveTServerSet.java | 4 +-- .../apache/accumulo/server/rpc/TServerUtils.java | 2 +- .../accumulo/server/util/FileSystemMonitor.java | 3 +- .../accumulo/tserver/TabletClientHandler.java | 11 +++--- .../org/apache/accumulo/tserver/TabletServer.java | 3 +- .../accumulo/tserver/log/TabletServerLogger.java | 14 ++++---- .../accumulo/tserver/tablet/MinorCompactor.java | 2 +- .../org/apache/accumulo/tserver/tablet/Tablet.java | 2 +- 12 files changed, 53 insertions(+), 56 deletions(-) diff --git a/core/src/main/java/org/apache/accumulo/core/fate/zookeeper/ServiceLockSupport.java b/core/src/main/java/org/apache/accumulo/core/fate/zookeeper/ServiceLockSupport.java index cd5c400e01..e4514e0808 100644 --- a/core/src/main/java/org/apache/accumulo/core/fate/zookeeper/ServiceLockSupport.java +++ b/core/src/main/java/org/apache/accumulo/core/fate/zookeeper/ServiceLockSupport.java @@ -56,16 +56,13 @@ public class ServiceLockSupport { LOG.warn("{} lost lock (reason = {}), not halting because shutdown is complete.", serviceName, reason); } else { - Halt.halt(serviceName + " lock in zookeeper lost (reason = " + reason + "), exiting!", -1); + Halt.halt(-1, serviceName + " lock in zookeeper lost (reason = " + reason + "), exiting!"); } } @Override public void unableToMonitorLockNode(final Exception e) { - // ACCUMULO-3651 Changed level to error and added FATAL to message for slf4j compatibility - Halt.halt(-1, - () -> LOG.error("FATAL: No longer able to monitor {} lock node", serviceName, e)); - + Halt.halt(-1, "FATAL: No longer able to monitor " + serviceName + " lock node", e); } @Override @@ -73,7 +70,7 @@ public class ServiceLockSupport { LOG.debug("Acquired {} lock", serviceName); if (acquiredLock || failedToAcquireLock) { - Halt.halt("Zoolock in unexpected state AL " + acquiredLock + " " + failedToAcquireLock, -1); + Halt.halt(-1, "Zoolock in unexpected state AL " + acquiredLock + " " + failedToAcquireLock); } acquiredLock = true; @@ -88,12 +85,12 @@ public class ServiceLockSupport { String msg = "Failed to acquire " + serviceName + " lock due to incorrect ZooKeeper authentication."; LOG.error("{} Ensure instance.secret is consistent across Accumulo configuration", msg, e); - Halt.halt(msg, -1); + Halt.halt(-1, msg); } if (acquiredLock) { - Halt.halt("Zoolock in unexpected state acquiredLock true with FAL " + failedToAcquireLock, - -1); + Halt.halt(-1, + "Zoolock in unexpected state acquiredLock true with FAL " + failedToAcquireLock); } failedToAcquireLock = true; @@ -145,16 +142,14 @@ public class ServiceLockSupport { LOG.warn("{} lost lock (reason = {}), not halting because shutdown is complete.", serviceName, reason); } else { - Halt.halt(1, () -> { - LOG.error("{} lost lock (reason = {}), exiting.", serviceName, reason); - lostLockAction.accept(serviceName); - }); + Halt.halt(1, serviceName + " lost lock (reason = " + reason + "), exiting.", + () -> lostLockAction.accept(serviceName)); } } @Override public void unableToMonitorLockNode(final Exception e) { - Halt.halt(1, () -> LOG.error("Lost ability to monitor {} lock, exiting.", serviceName, e)); + Halt.halt(1, "Lost ability to monitor " + serviceName + " lock, exiting.", e); } } diff --git a/core/src/main/java/org/apache/accumulo/core/util/Halt.java b/core/src/main/java/org/apache/accumulo/core/util/Halt.java index ec822b48cc..1b10b92f16 100644 --- a/core/src/main/java/org/apache/accumulo/core/util/Halt.java +++ b/core/src/main/java/org/apache/accumulo/core/util/Halt.java @@ -28,34 +28,43 @@ import org.slf4j.LoggerFactory; public class Halt { private static final Logger log = LoggerFactory.getLogger(Halt.class); - public static void halt(final String msg) { - // ACCUMULO-3651 Changed level to error and added FATAL to message for slf4j compatibility - halt(0, new Runnable() { - @Override - public void run() { - log.error("FATAL {}", msg); - } - }); + public static void halt(final int status, final String msg) { + halt(status, msg, null, null); } - public static void halt(final String msg, int status) { - halt(status, new Runnable() { - @Override - public void run() { - log.error("FATAL {}", msg); - } - }); + public static void halt(final int status, final String msg, final Throwable exception) { + halt(status, msg, exception, null); } - public static void halt(final int status, Runnable runnable) { + public static void halt(final int status, final String msg, final Runnable runnable) { + halt(status, msg, null, runnable); + } + public static void halt(final int status, final String msg, final Throwable exception, + final Runnable runnable) { try { + // give ourselves a little time to try and do something Threads.createThread("Halt Thread", () -> { sleepUninterruptibly(100, MILLISECONDS); Runtime.getRuntime().halt(status); }).start(); + final String errorMessage = "FATAL " + msg; + // Printing to stderr and to the log in case the message does not make + // it to the log. This could happen if an asynchronous logging impl is used + System.err.println(errorMessage); + if (exception != null) { + exception.printStackTrace(); + } + System.err.flush(); + + if (exception != null) { + log.error(errorMessage, exception); + } else { + log.error(errorMessage); + } + if (runnable != null) { runnable.run(); } diff --git a/server/base/src/main/java/org/apache/accumulo/server/AbstractServer.java b/server/base/src/main/java/org/apache/accumulo/server/AbstractServer.java index 20f7db5b12..4bee11be06 100644 --- a/server/base/src/main/java/org/apache/accumulo/server/AbstractServer.java +++ b/server/base/src/main/java/org/apache/accumulo/server/AbstractServer.java @@ -251,7 +251,7 @@ public abstract class AbstractServer log.trace( "ServiceLockVerificationThread - checking ServiceLock existence in ZooKeeper"); if (lock != null && !lock.verifyLockAtSource()) { - Halt.halt("Lock verification thread could not find lock", -1); + Halt.halt(-1, "Lock verification thread could not find lock"); } // Need to sleep, not yield when the thread priority is greater than NORM_PRIORITY // so that this thread does not get immediately rescheduled. diff --git a/server/base/src/main/java/org/apache/accumulo/server/GarbageCollectionLogger.java b/server/base/src/main/java/org/apache/accumulo/server/GarbageCollectionLogger.java index 469ec5632d..d3de0fa837 100644 --- a/server/base/src/main/java/org/apache/accumulo/server/GarbageCollectionLogger.java +++ b/server/base/src/main/java/org/apache/accumulo/server/GarbageCollectionLogger.java @@ -117,7 +117,7 @@ public class GarbageCollectionLogger { } if (maxIncreaseInCollectionTime > keepAliveTimeout) { - Halt.halt("Garbage collection may be interfering with lock keep-alive. Halting.", -1); + Halt.halt(-1, "Garbage collection may be interfering with lock keep-alive. Halting."); } lastMemorySize = mem; diff --git a/server/base/src/main/java/org/apache/accumulo/server/manager/LiveTServerSet.java b/server/base/src/main/java/org/apache/accumulo/server/manager/LiveTServerSet.java index 572c5954c1..1757e29580 100644 --- a/server/base/src/main/java/org/apache/accumulo/server/manager/LiveTServerSet.java +++ b/server/base/src/main/java/org/apache/accumulo/server/manager/LiveTServerSet.java @@ -465,9 +465,7 @@ public class LiveTServerSet implements Watcher { context.getZooReaderWriter().recursiveDelete(fullpath, SKIP); } catch (Exception e) { String msg = "error removing tablet server lock"; - // ACCUMULO-3651 Changed level to error and added FATAL to message for slf4j compatibility - log.error("FATAL: {}", msg, e); - Halt.halt(msg, -1); + Halt.halt(-1, msg, e); } getZooCache().clear(fullpath); } diff --git a/server/base/src/main/java/org/apache/accumulo/server/rpc/TServerUtils.java b/server/base/src/main/java/org/apache/accumulo/server/rpc/TServerUtils.java index 22a191a7a6..332e286a3f 100644 --- a/server/base/src/main/java/org/apache/accumulo/server/rpc/TServerUtils.java +++ b/server/base/src/main/java/org/apache/accumulo/server/rpc/TServerUtils.java @@ -664,7 +664,7 @@ public class TServerUtils { try { finalServer.serve(); } catch (Error e) { - Halt.halt("Unexpected error in TThreadPoolServer " + e + ", halting.", 1); + Halt.halt(1, "Unexpected error in TThreadPoolServer " + e + ", halting.", e); } }).start(); diff --git a/server/base/src/main/java/org/apache/accumulo/server/util/FileSystemMonitor.java b/server/base/src/main/java/org/apache/accumulo/server/util/FileSystemMonitor.java index 12d39bbf25..90a9c810ee 100644 --- a/server/base/src/main/java/org/apache/accumulo/server/util/FileSystemMonitor.java +++ b/server/base/src/main/java/org/apache/accumulo/server/util/FileSystemMonitor.java @@ -121,8 +121,7 @@ public class FileSystemMonitor { try { checkMount(mount); } catch (final Exception e) { - Halt.halt(-42, - () -> log.error("Exception while checking mount points, halting process", e)); + Halt.halt(1, "Exception while checking mount points, halting process", e); } })); } diff --git a/server/tserver/src/main/java/org/apache/accumulo/tserver/TabletClientHandler.java b/server/tserver/src/main/java/org/apache/accumulo/tserver/TabletClientHandler.java index 1188bbdf1d..deb5b8df2c 100644 --- a/server/tserver/src/main/java/org/apache/accumulo/tserver/TabletClientHandler.java +++ b/server/tserver/src/main/java/org/apache/accumulo/tserver/TabletClientHandler.java @@ -1155,11 +1155,9 @@ public class TabletClientHandler implements TabletClientService.Iface { log.trace("Got {} message from user: {}", request, credentials.getPrincipal()); if (server.getLock() != null && server.getLock().wasLockAcquired() && !server.getLock().isLocked()) { - Halt.halt(1, () -> { - log.info("Tablet server no longer holds lock during checkPermission() : {}, exiting", - request); - server.getGcLogger().logGCInfo(server.getConfiguration()); - }); + Halt.halt(1, + "Tablet server no longer holds lock during checkPermission() : " + request + ", exiting", + () -> server.getGcLogger().logGCInfo(server.getConfiguration())); } if (lock != null) { @@ -1344,8 +1342,7 @@ public class TabletClientHandler implements TabletClientService.Iface { checkPermission(context, server, credentials, lock, "halt"); - Halt.halt(0, () -> { - log.info("Manager requested tablet server halt"); + Halt.halt(0, "Manager requested tablet server halt", () -> { server.gcLogger.logGCInfo(server.getConfiguration()); server.requestStop(); try { diff --git a/server/tserver/src/main/java/org/apache/accumulo/tserver/TabletServer.java b/server/tserver/src/main/java/org/apache/accumulo/tserver/TabletServer.java index 4ef6d7781f..bee674f795 100644 --- a/server/tserver/src/main/java/org/apache/accumulo/tserver/TabletServer.java +++ b/server/tserver/src/main/java/org/apache/accumulo/tserver/TabletServer.java @@ -946,8 +946,7 @@ public class TabletServer extends AbstractServer iface.tabletServerStopping(TraceUtil.traceInfo(), getContext().rpcCreds(), getClientAddressString()); } catch (TException e) { - LOG.error("Error informing Manager that we are shutting down, halting server", e); - Halt.halt("Error informing Manager that we are shutting down, exiting!", -1); + Halt.halt(-1, "Error informing Manager that we are shutting down, exiting!", e); } finally { returnManagerConnection(iface); } diff --git a/server/tserver/src/main/java/org/apache/accumulo/tserver/log/TabletServerLogger.java b/server/tserver/src/main/java/org/apache/accumulo/tserver/log/TabletServerLogger.java index 46970b4c21..5cc5c26e56 100644 --- a/server/tserver/src/main/java/org/apache/accumulo/tserver/log/TabletServerLogger.java +++ b/server/tserver/src/main/java/org/apache/accumulo/tserver/log/TabletServerLogger.java @@ -250,9 +250,8 @@ public class TabletServerLogger { throw new RuntimeException(e); } } else { - log.error("Repeatedly failed to create WAL. Going to exit tabletserver.", t); // We didn't have retries or we failed too many times. - Halt.halt("Experienced too many errors creating WALs, giving up", 1); + Halt.halt(1, "Experienced too many errors creating WALs, giving up", t); } // The exception will trigger the log creation to be re-attempted. @@ -395,7 +394,7 @@ public class TabletServerLogger { boolean success = false; while (!success) { - boolean sawWriteFailure = false; + Throwable sawWriteFailure = null; try { // get a reference to the loggers that no other thread can touch AtomicInteger currentId = new AtomicInteger(-1); @@ -450,7 +449,7 @@ public class TabletServerLogger { writeRetry.logRetry(log, "Logs closed while writing", ex); } catch (Exception t) { writeRetry.logRetry(log, "Failed to write to WAL", t); - sawWriteFailure = true; + sawWriteFailure = t; try { // Backoff writeRetry.waitForNextAttempt(log, "write to WAL"); @@ -467,10 +466,11 @@ public class TabletServerLogger { final int finalCurrent = currentLogId; if (!success) { final ServiceLock tabletServerLock = tserver.getLock(); - if (sawWriteFailure) { - log.info("WAL write failure, validating server lock in ZooKeeper"); + if (sawWriteFailure != null) { + log.info("WAL write failure, validating server lock in ZooKeeper", sawWriteFailure); if (tabletServerLock == null || !tabletServerLock.verifyLockAtSource()) { - Halt.halt("Writing to WAL has failed and TabletServer lock does not exist", -1); + Halt.halt(-1, "Writing to WAL has failed and TabletServer lock does not exist", + sawWriteFailure); } } diff --git a/server/tserver/src/main/java/org/apache/accumulo/tserver/tablet/MinorCompactor.java b/server/tserver/src/main/java/org/apache/accumulo/tserver/tablet/MinorCompactor.java index 71258af11d..f265056c6a 100644 --- a/server/tserver/src/main/java/org/apache/accumulo/tserver/tablet/MinorCompactor.java +++ b/server/tserver/src/main/java/org/apache/accumulo/tserver/tablet/MinorCompactor.java @@ -103,7 +103,7 @@ public class MinorCompactor extends FileCompactor { if (tserverLock == null || !tserverLock.verifyLockAtSource()) { log.error("Minor compaction of {} has failed and TabletServer lock does not exist." + " Halting...", getExtent(), e); - Halt.halt("TabletServer lock does not exist", -1); + Halt.halt(-1, "TabletServer lock does not exist", e); } else { throw e; } diff --git a/server/tserver/src/main/java/org/apache/accumulo/tserver/tablet/Tablet.java b/server/tserver/src/main/java/org/apache/accumulo/tserver/tablet/Tablet.java index 246622d277..0f8773d93a 100644 --- a/server/tserver/src/main/java/org/apache/accumulo/tserver/tablet/Tablet.java +++ b/server/tserver/src/main/java/org/apache/accumulo/tserver/tablet/Tablet.java @@ -516,7 +516,7 @@ public class Tablet extends TabletBase { if (tserverLock == null || !tserverLock.verifyLockAtSource()) { log.error("Minor compaction of {} has failed and TabletServer lock does not exist." + " Halting...", getExtent(), e); - Halt.halt("TabletServer lock does not exist", -1); + Halt.halt(-1, "TabletServer lock does not exist", e); } else { TraceUtil.setException(span2, e, true); throw e;