This is an automated email from the ASF dual-hosted git repository.

dlmarion pushed a commit to branch 2.1
in repository https://gitbox.apache.org/repos/asf/accumulo.git


The following commit(s) were added to refs/heads/2.1 by this push:
     new 249fac277d Fixed concurrency issue in 
ServiceLock.determineLockOwnership (#6199)
249fac277d is described below

commit 249fac277dab776ad8274cde6cb4d615ba51ced7
Author: Dave Marion <[email protected]>
AuthorDate: Tue Mar 10 08:09:18 2026 -0400

    Fixed concurrency issue in ServiceLock.determineLockOwnership (#6199)
    
    Added some detailed comments to ServiceLock to make
    it easier to maintain. Renamed variables where it would
    make it clearer and added volatile modifiers to variables
    that were set or checked in a Watcher. Made sure that
    createdNodeName was not null before calling
    determineLockOwnership.
    
    Closes #6197
---
 .../accumulo/core/fate/zookeeper/ServiceLock.java  | 153 ++++++++++++++-------
 .../test/fate/zookeeper/ServiceLockIT.java         |   2 +-
 2 files changed, 103 insertions(+), 52 deletions(-)

diff --git 
a/core/src/main/java/org/apache/accumulo/core/fate/zookeeper/ServiceLock.java 
b/core/src/main/java/org/apache/accumulo/core/fate/zookeeper/ServiceLock.java
index ea963f9ec8..e52de4a036 100644
--- 
a/core/src/main/java/org/apache/accumulo/core/fate/zookeeper/ServiceLock.java
+++ 
b/core/src/main/java/org/apache/accumulo/core/fate/zookeeper/ServiceLock.java
@@ -45,6 +45,25 @@ import org.apache.zookeeper.data.Stat;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
+import com.google.common.base.Preconditions;
+
+/**
+ * This class uses Sequential Ephemeral ZooKeeper Nodes
+ * 
(https://zookeeper.apache.org/doc/r3.9.5/zookeeperProgrammers.html#Sequence+Nodes+--+Unique+Naming)
+ * to implement locks for Accumulo server processes. This class will create an 
ephemeral sequential
+ * node under a base path using the prefix "zlock#" + UUID + "#". The 
ZooKeeper server will append a
+ * 10-digit zero-padded number to this prefix using a one-up counter. The base 
path could be an HA
+ * service like the Manager or a non-HA service like a TabletServer.
+ *
+ * When an instance of this class has the lowest counter number at the base 
path, then it has the
+ * lock. When an instance of this class does not have the lowest counter 
number, then it watches the
+ * node with the next lowest counter number. When the node that has the next 
lowest counter number
+ * is deleted, then this instance could acquire the lock.
+ *
+ * Instance of this class also place a Watcher on the base path node. If the 
base path Watcher
+ * receives a Session Expired event, then it calls lostLock which should end 
up halting the Accumulo
+ * server process.
+ */
 public class ServiceLock implements Watcher {
   private static final Logger LOG = LoggerFactory.getLogger(ServiceLock.class);
 
@@ -100,24 +119,52 @@ public class ServiceLock implements Watcher {
     void failedToAcquireLock(Exception e);
   }
 
+  // the base path
   private final ServiceLockPath path;
+
   protected final ZooKeeper zooKeeper;
+
+  // "zlock#" + UUID + "#"
   private final Prefix vmLockPrefix;
 
+  // A LockWatcher instance supplied to this class
+  // by the caller when trying to acquire the lock
+  // in ZooKeeper. This object does not represent
+  // a Watcher in ZooKeeper, but allows an instance
+  // of this class to communicate with the calling
+  // object by invoking callback methods.
   private LockWatcher lockWatcher;
-  private String lockNodeName;
+
+  // A variable which is initially null, then set
+  // to the createdNodeName when the lock is acquired.
+  // This variable is set to null when this instance
+  // loses the lock.
+  private volatile String lockNodeName;
+
+  // boolean to track if this instance has ever held the lock
   private volatile boolean lockWasAcquired;
-  private volatile boolean watchingParent;
 
-  private String createdNodeName;
-  private String watchingNodeName;
+  // boolean to track if there is a watcher on the base path.
+  private volatile boolean watchingBasePath;
+
+  // Represents the name of the ephemeral sequential node that
+  // the ZooKeeper server created for us with the unique one-up
+  // counter. This variable is set to null when this instance
+  // acquires the lock.
+  private volatile String createdNodeName;
+
+  // Represents the path of the ephemeral sequential node that
+  // has the next lowest counter value. An instance of this class
+  // will watch this node and will attempt to acquire the lock
+  // when this node is deleted.
+  private String watchingNodePath;
 
   public ServiceLock(ZooKeeper zookeeper, ServiceLockPath path, UUID uuid) {
     this.zooKeeper = requireNonNull(zookeeper);
     this.path = requireNonNull(path);
     try {
       zooKeeper.exists(path.toString(), this);
-      watchingParent = true;
+      watchingBasePath = true;
       this.vmLockPrefix = new Prefix(ZLOCK_PREFIX + uuid.toString() + "#");
     } catch (Exception ex) {
       LOG.error("Error setting initial watch", ex);
@@ -125,13 +172,17 @@ public class ServiceLock implements Watcher {
     }
   }
 
-  private static class LockWatcherWrapper implements AccumuloLockWatcher {
+  // Watcher used in tryLock method that wraps the supplied LockWatcher
+  // so that the supplied LockWatcher.failedToAcquireLock method is not
+  // invoked. The supplied LockWatcher.failedToAcquireLock method may
+  // halt the server, but we don't want to do that in a tryLock invocation.
+  private static class TryLockWatcherWrapper implements AccumuloLockWatcher {
 
-    boolean acquiredLock = false;
-    LockWatcher lw;
+    private boolean acquiredLock = false;
+    private final LockWatcher delegate;
 
-    public LockWatcherWrapper(LockWatcher lw2) {
-      this.lw = lw2;
+    public TryLockWatcherWrapper(LockWatcher lw2) {
+      this.delegate = lw2;
     }
 
     @Override
@@ -146,12 +197,12 @@ public class ServiceLock implements Watcher {
 
     @Override
     public void lostLock(LockLossReason reason) {
-      lw.lostLock(reason);
+      delegate.lostLock(reason);
     }
 
     @Override
     public void unableToMonitorLockNode(Exception e) {
-      lw.unableToMonitorLockNode(e);
+      delegate.unableToMonitorLockNode(e);
     }
 
   }
@@ -159,7 +210,7 @@ public class ServiceLock implements Watcher {
   public synchronized boolean tryLock(LockWatcher lw, byte[] data)
       throws KeeperException, InterruptedException {
 
-    LockWatcherWrapper lww = new LockWatcherWrapper(lw);
+    TryLockWatcherWrapper lww = new TryLockWatcherWrapper(lw);
 
     lock(lww, data);
 
@@ -276,31 +327,27 @@ public class ServiceLock implements Watcher {
     return lowestPrevNode;
   }
 
-  private synchronized void determineLockOwnership(final String 
createdEphemeralNode,
-      final AccumuloLockWatcher lw) throws KeeperException, 
InterruptedException {
+  private synchronized void determineLockOwnership(final AccumuloLockWatcher 
lw)
+      throws KeeperException, InterruptedException {
 
-    if (createdNodeName == null) {
-      throw new IllegalStateException(
-          "Called determineLockOwnership() when ephemeralNodeName == null");
-    }
+    Preconditions.checkState(createdNodeName != null, "createdNodeName cannot 
be null");
 
     List<String> children = validateAndSort(path, 
zooKeeper.getChildren(path.toString(), null));
 
-    if (!children.contains(createdEphemeralNode)) {
-      LOG.error("Expected ephemeral node {} to be in the list of children {}", 
createdEphemeralNode,
+    if (!children.contains(createdNodeName)) {
+      LOG.error("Expected ephemeral node {} to be in the list of children {}", 
createdNodeName,
           children);
-      throw new RuntimeException(
-          "Lock attempt ephemeral node no longer exist " + 
createdEphemeralNode);
+      throw new RuntimeException("Lock attempt ephemeral node no longer exist 
" + createdNodeName);
     }
 
-    if (children.get(0).equals(createdEphemeralNode)) {
+    if (children.get(0).equals(createdNodeName)) {
       LOG.debug("[{}] First candidate is my lock, acquiring...", vmLockPrefix);
-      if (!watchingParent) {
+      if (!watchingBasePath) {
         throw new IllegalStateException(
             "Can not acquire lock, no longer watching parent : " + path);
       }
       this.lockWatcher = lw;
-      this.lockNodeName = createdEphemeralNode;
+      this.lockNodeName = createdNodeName;
       createdNodeName = null;
       lockWasAcquired = true;
       lw.acquiredLock();
@@ -308,16 +355,16 @@ public class ServiceLock implements Watcher {
       LOG.debug("[{}] Lock held by another process with ephemeral node: {}", 
vmLockPrefix,
           children.get(0));
 
-      String lowestPrevNode = findLowestPrevPrefix(children, 
createdEphemeralNode);
+      String lowestPrevNode = findLowestPrevPrefix(children, createdNodeName);
 
-      watchingNodeName = path + "/" + lowestPrevNode;
-      final String nodeToWatch = watchingNodeName;
+      watchingNodePath = path + "/" + lowestPrevNode;
+      final String nodeToWatch = watchingNodePath;
       LOG.debug("[{}] Establishing watch on prior node {}", vmLockPrefix, 
nodeToWatch);
       Watcher priorNodeWatcher = new Watcher() {
         @Override
         public void process(WatchedEvent event) {
-          if (LOG.isTraceEnabled()) {
-            LOG.trace("[{}] Processing {}", vmLockPrefix, event);
+          if (LOG.isDebugEnabled()) {
+            LOG.debug("[{}] Processing {}", vmLockPrefix, event);
           }
           boolean renew = true;
           if (event.getType() == EventType.NodeDeleted && 
event.getPath().equals(nodeToWatch)) {
@@ -326,10 +373,10 @@ public class ServiceLock implements Watcher {
             synchronized (ServiceLock.this) {
               try {
                 if (createdNodeName != null) {
-                  determineLockOwnership(createdEphemeralNode, lw);
+                  determineLockOwnership(lw);
                 } else if (LOG.isDebugEnabled()) {
                   LOG.debug("[{}] While waiting for another lock {}, {} was 
deleted; {}",
-                      vmLockPrefix, nodeToWatch, createdEphemeralNode, event);
+                      vmLockPrefix, nodeToWatch, createdNodeName, event);
                 }
               } catch (Exception e) {
                 if (lockNodeName == null) {
@@ -353,20 +400,24 @@ public class ServiceLock implements Watcher {
             renew = false;
           }
           if (renew) {
-            try {
-              Stat restat = zooKeeper.exists(nodeToWatch, this);
-              if (restat == null) {
-                // if stat is null from the zookeeper.exists(path, Watcher) 
call, then we just
-                // created a Watcher on a node that does not exist. Delete the 
watcher we just
-                // created.
-                zooKeeper.removeWatches(nodeToWatch, this, WatcherType.Any, 
true);
-                determineLockOwnership(createdEphemeralNode, lw);
-              } else {
-                LOG.debug("[{}] Renewed watch on prior node  {}", 
vmLockPrefix, nodeToWatch);
+            synchronized (ServiceLock.this) {
+              if (createdNodeName != null) {
+                try {
+                  Stat restat = zooKeeper.exists(nodeToWatch, this);
+                  if (restat == null) {
+                    // if stat is null from the zookeeper.exists(path, 
Watcher) call, then we just
+                    // created a Watcher on a node that does not exist. Delete 
the watcher we just
+                    // created.
+                    zooKeeper.removeWatches(nodeToWatch, this, 
WatcherType.Any, true);
+                    determineLockOwnership(lw);
+                  } else {
+                    LOG.debug("[{}] Renewed watch on prior node {}", 
vmLockPrefix, nodeToWatch);
+                  }
+                } catch (KeeperException | InterruptedException e) {
+                  lw.failedToAcquireLock(
+                      new Exception("Failed to renew watch on prior node: " + 
nodeToWatch, e));
+                }
               }
-            } catch (KeeperException | InterruptedException e) {
-              lw.failedToAcquireLock(
-                  new Exception("Failed to renew watch on other manager node", 
e));
             }
           }
         }
@@ -378,7 +429,7 @@ public class ServiceLock implements Watcher {
         // if stat is null from the zookeeper.exists(path, Watcher) call, then 
we just
         // created a Watcher on a node that does not exist. Delete the watcher 
we just created.
         zooKeeper.removeWatches(nodeToWatch, priorNodeWatcher, 
WatcherType.Any, true);
-        determineLockOwnership(createdEphemeralNode, lw);
+        determineLockOwnership(lw);
       }
     }
 
@@ -521,7 +572,7 @@ public class ServiceLock implements Watcher {
       createdNodeName = pathForWatcher.substring(path.toString().length() + 1);
 
       // We have created a node, do we own the lock?
-      determineLockOwnership(createdNodeName, lw);
+      determineLockOwnership(lw);
 
     } catch (KeeperException | InterruptedException e) {
       lw.failedToAcquireLock(e);
@@ -570,7 +621,7 @@ public class ServiceLock implements Watcher {
    * @return path of node that this lock is watching
    */
   public synchronized String getWatching() {
-    return watchingNodeName;
+    return watchingNodePath;
   }
 
   public synchronized String getLockPath() {
@@ -617,7 +668,7 @@ public class ServiceLock implements Watcher {
       LOG.debug("{}", event);
     }
 
-    watchingParent = false;
+    watchingBasePath = false;
 
     if (event.getState() == KeeperState.Expired && lockNodeName != null) {
       lostLock(LockLossReason.SESSION_EXPIRED);
@@ -625,7 +676,7 @@ public class ServiceLock implements Watcher {
 
       try { // set the watch on the parent node again
         zooKeeper.exists(path.toString(), this);
-        watchingParent = true;
+        watchingBasePath = true;
       } catch (KeeperException.ConnectionLossException ex) {
         // we can't look at the lock because we aren't connected, but our 
session is still good
         LOG.warn("lost connection to zookeeper", ex);
diff --git 
a/test/src/main/java/org/apache/accumulo/test/fate/zookeeper/ServiceLockIT.java 
b/test/src/main/java/org/apache/accumulo/test/fate/zookeeper/ServiceLockIT.java
index 75ba3d6c36..ed4efda820 100644
--- 
a/test/src/main/java/org/apache/accumulo/test/fate/zookeeper/ServiceLockIT.java
+++ 
b/test/src/main/java/org/apache/accumulo/test/fate/zookeeper/ServiceLockIT.java
@@ -684,7 +684,7 @@ public class ServiceLockIT {
 
       // make sure still watching parent even though a lot of events occurred 
for the parent
       synchronized (zl) {
-        Field field = zl.getClass().getDeclaredField("watchingParent");
+        Field field = zl.getClass().getDeclaredField("watchingBasePath");
         field.setAccessible(true);
         assertTrue((Boolean) field.get(zl));
       }

Reply via email to