This is an automated email from the ASF dual-hosted git repository.

dlmarion pushed a commit to branch 2.1
in repository https://gitbox.apache.org/repos/asf/accumulo.git


The following commit(s) were added to refs/heads/2.1 by this push:
     new 041e4c1612 Changed Halt to print msg to stderr (#5574)
041e4c1612 is described below

commit 041e4c1612fe12df775d17f5af2839df70f4b584
Author: Dave Marion <dlmar...@apache.org>
AuthorDate: Wed May 28 11:54:33 2025 -0400

    Changed Halt to print msg to stderr (#5574)
    
    Logging that happens in proximity to a Runtime.halt
    may not make it to the log before the process shuts
    down if an asynchronous logging implementation is used.
    
    This change modifies the Halt methods to print any log
    message to stderr and flushing before logging to the
    logging implementation.
    
    Closes #5545
---
 .../core/fate/zookeeper/ServiceLockSupport.java    | 23 +++++-------
 .../java/org/apache/accumulo/core/util/Halt.java   | 41 +++++++++++++---------
 .../org/apache/accumulo/server/AbstractServer.java |  2 +-
 .../accumulo/server/GarbageCollectionLogger.java   |  2 +-
 .../accumulo/server/manager/LiveTServerSet.java    |  4 +--
 .../apache/accumulo/server/rpc/TServerUtils.java   |  2 +-
 .../accumulo/server/util/FileSystemMonitor.java    |  3 +-
 .../accumulo/tserver/TabletClientHandler.java      | 11 +++---
 .../org/apache/accumulo/tserver/TabletServer.java  |  3 +-
 .../accumulo/tserver/log/TabletServerLogger.java   | 14 ++++----
 .../accumulo/tserver/tablet/MinorCompactor.java    |  2 +-
 .../org/apache/accumulo/tserver/tablet/Tablet.java |  2 +-
 12 files changed, 53 insertions(+), 56 deletions(-)

diff --git 
a/core/src/main/java/org/apache/accumulo/core/fate/zookeeper/ServiceLockSupport.java
 
b/core/src/main/java/org/apache/accumulo/core/fate/zookeeper/ServiceLockSupport.java
index cd5c400e01..e4514e0808 100644
--- 
a/core/src/main/java/org/apache/accumulo/core/fate/zookeeper/ServiceLockSupport.java
+++ 
b/core/src/main/java/org/apache/accumulo/core/fate/zookeeper/ServiceLockSupport.java
@@ -56,16 +56,13 @@ public class ServiceLockSupport {
         LOG.warn("{} lost lock (reason = {}), not halting because shutdown is 
complete.",
             serviceName, reason);
       } else {
-        Halt.halt(serviceName + " lock in zookeeper lost (reason = " + reason 
+ "), exiting!", -1);
+        Halt.halt(-1, serviceName + " lock in zookeeper lost (reason = " + 
reason + "), exiting!");
       }
     }
 
     @Override
     public void unableToMonitorLockNode(final Exception e) {
-      // ACCUMULO-3651 Changed level to error and added FATAL to message for 
slf4j compatibility
-      Halt.halt(-1,
-          () -> LOG.error("FATAL: No longer able to monitor {} lock node", 
serviceName, e));
-
+      Halt.halt(-1, "FATAL: No longer able to monitor " + serviceName + " lock 
node", e);
     }
 
     @Override
@@ -73,7 +70,7 @@ public class ServiceLockSupport {
       LOG.debug("Acquired {} lock", serviceName);
 
       if (acquiredLock || failedToAcquireLock) {
-        Halt.halt("Zoolock in unexpected state AL " + acquiredLock + " " + 
failedToAcquireLock, -1);
+        Halt.halt(-1, "Zoolock in unexpected state AL " + acquiredLock + " " + 
failedToAcquireLock);
       }
 
       acquiredLock = true;
@@ -88,12 +85,12 @@ public class ServiceLockSupport {
         String msg =
             "Failed to acquire " + serviceName + " lock due to incorrect 
ZooKeeper authentication.";
         LOG.error("{} Ensure instance.secret is consistent across Accumulo 
configuration", msg, e);
-        Halt.halt(msg, -1);
+        Halt.halt(-1, msg);
       }
 
       if (acquiredLock) {
-        Halt.halt("Zoolock in unexpected state acquiredLock true with FAL " + 
failedToAcquireLock,
-            -1);
+        Halt.halt(-1,
+            "Zoolock in unexpected state acquiredLock true with FAL " + 
failedToAcquireLock);
       }
 
       failedToAcquireLock = true;
@@ -145,16 +142,14 @@ public class ServiceLockSupport {
         LOG.warn("{} lost lock (reason = {}), not halting because shutdown is 
complete.",
             serviceName, reason);
       } else {
-        Halt.halt(1, () -> {
-          LOG.error("{} lost lock (reason = {}), exiting.", serviceName, 
reason);
-          lostLockAction.accept(serviceName);
-        });
+        Halt.halt(1, serviceName + " lost lock (reason = " + reason + "), 
exiting.",
+            () -> lostLockAction.accept(serviceName));
       }
     }
 
     @Override
     public void unableToMonitorLockNode(final Exception e) {
-      Halt.halt(1, () -> LOG.error("Lost ability to monitor {} lock, 
exiting.", serviceName, e));
+      Halt.halt(1, "Lost ability to monitor " + serviceName + " lock, 
exiting.", e);
     }
 
   }
diff --git a/core/src/main/java/org/apache/accumulo/core/util/Halt.java 
b/core/src/main/java/org/apache/accumulo/core/util/Halt.java
index ec822b48cc..1b10b92f16 100644
--- a/core/src/main/java/org/apache/accumulo/core/util/Halt.java
+++ b/core/src/main/java/org/apache/accumulo/core/util/Halt.java
@@ -28,34 +28,43 @@ import org.slf4j.LoggerFactory;
 public class Halt {
   private static final Logger log = LoggerFactory.getLogger(Halt.class);
 
-  public static void halt(final String msg) {
-    // ACCUMULO-3651 Changed level to error and added FATAL to message for 
slf4j compatibility
-    halt(0, new Runnable() {
-      @Override
-      public void run() {
-        log.error("FATAL {}", msg);
-      }
-    });
+  public static void halt(final int status, final String msg) {
+    halt(status, msg, null, null);
   }
 
-  public static void halt(final String msg, int status) {
-    halt(status, new Runnable() {
-      @Override
-      public void run() {
-        log.error("FATAL {}", msg);
-      }
-    });
+  public static void halt(final int status, final String msg, final Throwable 
exception) {
+    halt(status, msg, exception, null);
   }
 
-  public static void halt(final int status, Runnable runnable) {
+  public static void halt(final int status, final String msg, final Runnable 
runnable) {
+    halt(status, msg, null, runnable);
+  }
 
+  public static void halt(final int status, final String msg, final Throwable 
exception,
+      final Runnable runnable) {
     try {
+
       // give ourselves a little time to try and do something
       Threads.createThread("Halt Thread", () -> {
         sleepUninterruptibly(100, MILLISECONDS);
         Runtime.getRuntime().halt(status);
       }).start();
 
+      final String errorMessage = "FATAL " + msg;
+      // Printing to stderr and to the log in case the message does not make
+      // it to the log. This could happen if an asynchronous logging impl is 
used
+      System.err.println(errorMessage);
+      if (exception != null) {
+        exception.printStackTrace();
+      }
+      System.err.flush();
+
+      if (exception != null) {
+        log.error(errorMessage, exception);
+      } else {
+        log.error(errorMessage);
+      }
+
       if (runnable != null) {
         runnable.run();
       }
diff --git 
a/server/base/src/main/java/org/apache/accumulo/server/AbstractServer.java 
b/server/base/src/main/java/org/apache/accumulo/server/AbstractServer.java
index 20f7db5b12..4bee11be06 100644
--- a/server/base/src/main/java/org/apache/accumulo/server/AbstractServer.java
+++ b/server/base/src/main/java/org/apache/accumulo/server/AbstractServer.java
@@ -251,7 +251,7 @@ public abstract class AbstractServer
                 log.trace(
                     "ServiceLockVerificationThread - checking ServiceLock 
existence in ZooKeeper");
                 if (lock != null && !lock.verifyLockAtSource()) {
-                  Halt.halt("Lock verification thread could not find lock", 
-1);
+                  Halt.halt(-1, "Lock verification thread could not find 
lock");
                 }
                 // Need to sleep, not yield when the thread priority is 
greater than NORM_PRIORITY
                 // so that this thread does not get immediately rescheduled.
diff --git 
a/server/base/src/main/java/org/apache/accumulo/server/GarbageCollectionLogger.java
 
b/server/base/src/main/java/org/apache/accumulo/server/GarbageCollectionLogger.java
index 469ec5632d..d3de0fa837 100644
--- 
a/server/base/src/main/java/org/apache/accumulo/server/GarbageCollectionLogger.java
+++ 
b/server/base/src/main/java/org/apache/accumulo/server/GarbageCollectionLogger.java
@@ -117,7 +117,7 @@ public class GarbageCollectionLogger {
       }
 
       if (maxIncreaseInCollectionTime > keepAliveTimeout) {
-        Halt.halt("Garbage collection may be interfering with lock keep-alive. 
 Halting.", -1);
+        Halt.halt(-1, "Garbage collection may be interfering with lock 
keep-alive.  Halting.");
       }
 
       lastMemorySize = mem;
diff --git 
a/server/base/src/main/java/org/apache/accumulo/server/manager/LiveTServerSet.java
 
b/server/base/src/main/java/org/apache/accumulo/server/manager/LiveTServerSet.java
index 572c5954c1..1757e29580 100644
--- 
a/server/base/src/main/java/org/apache/accumulo/server/manager/LiveTServerSet.java
+++ 
b/server/base/src/main/java/org/apache/accumulo/server/manager/LiveTServerSet.java
@@ -465,9 +465,7 @@ public class LiveTServerSet implements Watcher {
       context.getZooReaderWriter().recursiveDelete(fullpath, SKIP);
     } catch (Exception e) {
       String msg = "error removing tablet server lock";
-      // ACCUMULO-3651 Changed level to error and added FATAL to message for 
slf4j compatibility
-      log.error("FATAL: {}", msg, e);
-      Halt.halt(msg, -1);
+      Halt.halt(-1, msg, e);
     }
     getZooCache().clear(fullpath);
   }
diff --git 
a/server/base/src/main/java/org/apache/accumulo/server/rpc/TServerUtils.java 
b/server/base/src/main/java/org/apache/accumulo/server/rpc/TServerUtils.java
index 22a191a7a6..332e286a3f 100644
--- a/server/base/src/main/java/org/apache/accumulo/server/rpc/TServerUtils.java
+++ b/server/base/src/main/java/org/apache/accumulo/server/rpc/TServerUtils.java
@@ -664,7 +664,7 @@ public class TServerUtils {
       try {
         finalServer.serve();
       } catch (Error e) {
-        Halt.halt("Unexpected error in TThreadPoolServer " + e + ", halting.", 
1);
+        Halt.halt(1, "Unexpected error in TThreadPoolServer " + e + ", 
halting.", e);
       }
     }).start();
 
diff --git 
a/server/base/src/main/java/org/apache/accumulo/server/util/FileSystemMonitor.java
 
b/server/base/src/main/java/org/apache/accumulo/server/util/FileSystemMonitor.java
index 12d39bbf25..90a9c810ee 100644
--- 
a/server/base/src/main/java/org/apache/accumulo/server/util/FileSystemMonitor.java
+++ 
b/server/base/src/main/java/org/apache/accumulo/server/util/FileSystemMonitor.java
@@ -121,8 +121,7 @@ public class FileSystemMonitor {
             try {
               checkMount(mount);
             } catch (final Exception e) {
-              Halt.halt(-42,
-                  () -> log.error("Exception while checking mount points, 
halting process", e));
+              Halt.halt(1, "Exception while checking mount points, halting 
process", e);
             }
           }));
     }
diff --git 
a/server/tserver/src/main/java/org/apache/accumulo/tserver/TabletClientHandler.java
 
b/server/tserver/src/main/java/org/apache/accumulo/tserver/TabletClientHandler.java
index 1188bbdf1d..deb5b8df2c 100644
--- 
a/server/tserver/src/main/java/org/apache/accumulo/tserver/TabletClientHandler.java
+++ 
b/server/tserver/src/main/java/org/apache/accumulo/tserver/TabletClientHandler.java
@@ -1155,11 +1155,9 @@ public class TabletClientHandler implements 
TabletClientService.Iface {
     log.trace("Got {} message from user: {}", request, 
credentials.getPrincipal());
     if (server.getLock() != null && server.getLock().wasLockAcquired()
         && !server.getLock().isLocked()) {
-      Halt.halt(1, () -> {
-        log.info("Tablet server no longer holds lock during checkPermission() 
: {}, exiting",
-            request);
-        server.getGcLogger().logGCInfo(server.getConfiguration());
-      });
+      Halt.halt(1,
+          "Tablet server no longer holds lock during checkPermission() : " + 
request + ", exiting",
+          () -> server.getGcLogger().logGCInfo(server.getConfiguration()));
     }
 
     if (lock != null) {
@@ -1344,8 +1342,7 @@ public class TabletClientHandler implements 
TabletClientService.Iface {
 
     checkPermission(context, server, credentials, lock, "halt");
 
-    Halt.halt(0, () -> {
-      log.info("Manager requested tablet server halt");
+    Halt.halt(0, "Manager requested tablet server halt", () -> {
       server.gcLogger.logGCInfo(server.getConfiguration());
       server.requestStop();
       try {
diff --git 
a/server/tserver/src/main/java/org/apache/accumulo/tserver/TabletServer.java 
b/server/tserver/src/main/java/org/apache/accumulo/tserver/TabletServer.java
index 4ef6d7781f..bee674f795 100644
--- a/server/tserver/src/main/java/org/apache/accumulo/tserver/TabletServer.java
+++ b/server/tserver/src/main/java/org/apache/accumulo/tserver/TabletServer.java
@@ -946,8 +946,7 @@ public class TabletServer extends AbstractServer
       iface.tabletServerStopping(TraceUtil.traceInfo(), 
getContext().rpcCreds(),
           getClientAddressString());
     } catch (TException e) {
-      LOG.error("Error informing Manager that we are shutting down, halting 
server", e);
-      Halt.halt("Error informing Manager that we are shutting down, exiting!", 
-1);
+      Halt.halt(-1, "Error informing Manager that we are shutting down, 
exiting!", e);
     } finally {
       returnManagerConnection(iface);
     }
diff --git 
a/server/tserver/src/main/java/org/apache/accumulo/tserver/log/TabletServerLogger.java
 
b/server/tserver/src/main/java/org/apache/accumulo/tserver/log/TabletServerLogger.java
index 46970b4c21..5cc5c26e56 100644
--- 
a/server/tserver/src/main/java/org/apache/accumulo/tserver/log/TabletServerLogger.java
+++ 
b/server/tserver/src/main/java/org/apache/accumulo/tserver/log/TabletServerLogger.java
@@ -250,9 +250,8 @@ public class TabletServerLogger {
           throw new RuntimeException(e);
         }
       } else {
-        log.error("Repeatedly failed to create WAL. Going to exit 
tabletserver.", t);
         // We didn't have retries or we failed too many times.
-        Halt.halt("Experienced too many errors creating WALs, giving up", 1);
+        Halt.halt(1, "Experienced too many errors creating WALs, giving up", 
t);
       }
 
       // The exception will trigger the log creation to be re-attempted.
@@ -395,7 +394,7 @@ public class TabletServerLogger {
 
     boolean success = false;
     while (!success) {
-      boolean sawWriteFailure = false;
+      Throwable sawWriteFailure = null;
       try {
         // get a reference to the loggers that no other thread can touch
         AtomicInteger currentId = new AtomicInteger(-1);
@@ -450,7 +449,7 @@ public class TabletServerLogger {
         writeRetry.logRetry(log, "Logs closed while writing", ex);
       } catch (Exception t) {
         writeRetry.logRetry(log, "Failed to write to WAL", t);
-        sawWriteFailure = true;
+        sawWriteFailure = t;
         try {
           // Backoff
           writeRetry.waitForNextAttempt(log, "write to WAL");
@@ -467,10 +466,11 @@ public class TabletServerLogger {
       final int finalCurrent = currentLogId;
       if (!success) {
         final ServiceLock tabletServerLock = tserver.getLock();
-        if (sawWriteFailure) {
-          log.info("WAL write failure, validating server lock in ZooKeeper");
+        if (sawWriteFailure != null) {
+          log.info("WAL write failure, validating server lock in ZooKeeper", 
sawWriteFailure);
           if (tabletServerLock == null || 
!tabletServerLock.verifyLockAtSource()) {
-            Halt.halt("Writing to WAL has failed and TabletServer lock does 
not exist", -1);
+            Halt.halt(-1, "Writing to WAL has failed and TabletServer lock 
does not exist",
+                sawWriteFailure);
           }
         }
 
diff --git 
a/server/tserver/src/main/java/org/apache/accumulo/tserver/tablet/MinorCompactor.java
 
b/server/tserver/src/main/java/org/apache/accumulo/tserver/tablet/MinorCompactor.java
index 71258af11d..f265056c6a 100644
--- 
a/server/tserver/src/main/java/org/apache/accumulo/tserver/tablet/MinorCompactor.java
+++ 
b/server/tserver/src/main/java/org/apache/accumulo/tserver/tablet/MinorCompactor.java
@@ -103,7 +103,7 @@ public class MinorCompactor extends FileCompactor {
             if (tserverLock == null || !tserverLock.verifyLockAtSource()) {
               log.error("Minor compaction of {} has failed and TabletServer 
lock does not exist."
                   + " Halting...", getExtent(), e);
-              Halt.halt("TabletServer lock does not exist", -1);
+              Halt.halt(-1, "TabletServer lock does not exist", e);
             } else {
               throw e;
             }
diff --git 
a/server/tserver/src/main/java/org/apache/accumulo/tserver/tablet/Tablet.java 
b/server/tserver/src/main/java/org/apache/accumulo/tserver/tablet/Tablet.java
index 246622d277..0f8773d93a 100644
--- 
a/server/tserver/src/main/java/org/apache/accumulo/tserver/tablet/Tablet.java
+++ 
b/server/tserver/src/main/java/org/apache/accumulo/tserver/tablet/Tablet.java
@@ -516,7 +516,7 @@ public class Tablet extends TabletBase {
         if (tserverLock == null || !tserverLock.verifyLockAtSource()) {
           log.error("Minor compaction of {} has failed and TabletServer lock 
does not exist."
               + " Halting...", getExtent(), e);
-          Halt.halt("TabletServer lock does not exist", -1);
+          Halt.halt(-1, "TabletServer lock does not exist", e);
         } else {
           TraceUtil.setException(span2, e, true);
           throw e;

Reply via email to