This is an automated email from the ASF dual-hosted git repository.

szetszwo pushed a commit to branch RATIS-1931_grpc-zero-copy
in repository https://gitbox.apache.org/repos/asf/ratis.git


The following commit(s) were added to refs/heads/RATIS-1931_grpc-zero-copy by this push:
     new f76cb2e2d RATIS-2501. Improve diagnostics for testInstallSnapshotDuringBootstrap timeout failures. (#1427)
f76cb2e2d is described below

commit f76cb2e2dca5101f4d2e7e85e43908b0036e531d
Author: slfan1989 <[email protected]>
AuthorDate: Sun Apr 12 23:42:50 2026 +0800

    RATIS-2501. Improve diagnostics for testInstallSnapshotDuringBootstrap timeout failures. (#1427)
---
 .../ratis/InstallSnapshotNotificationTests.java    | 113 ++++++++++++++++++++-
 1 file changed, 111 insertions(+), 2 deletions(-)

diff --git a/ratis-server/src/test/java/org/apache/ratis/InstallSnapshotNotificationTests.java b/ratis-server/src/test/java/org/apache/ratis/InstallSnapshotNotificationTests.java
index 411c93120..f19f91f79 100644
--- a/ratis-server/src/test/java/org/apache/ratis/InstallSnapshotNotificationTests.java
+++ b/ratis-server/src/test/java/org/apache/ratis/InstallSnapshotNotificationTests.java
@@ -23,6 +23,7 @@ import org.apache.ratis.proto.RaftProtos;
 import org.apache.ratis.protocol.RaftClientReply;
 import org.apache.ratis.protocol.RaftPeer;
 import org.apache.ratis.protocol.RaftPeerId;
+import org.apache.ratis.retry.RetryPolicies;
 import org.apache.ratis.server.RaftServer;
 import org.apache.ratis.server.RaftServerConfigKeys;
 import org.apache.ratis.server.impl.MiniRaftCluster;
@@ -47,11 +48,13 @@ import org.slf4j.event.Level;
 
 import java.io.File;
 import java.io.IOException;
+import java.io.InterruptedIOException;
 import java.nio.file.Files;
 import java.nio.file.Path;
 import java.util.Arrays;
 import java.util.Collections;
 import java.util.List;
+import java.util.Optional;
 import java.util.concurrent.CompletableFuture;
 import java.util.concurrent.Executor;
 import java.util.concurrent.Executors;
@@ -83,6 +86,11 @@ public abstract class InstallSnapshotNotificationTests<CLUSTER extends MiniRaftC
 
   private static final int SNAPSHOT_TRIGGER_THRESHOLD = 64;
   private static final int PURGE_GAP = 8;
+  /**
+   * Keep the retry budget bounded so that a stuck reconfiguration surfaces as a test failure with
+   * diagnostics, instead of silently retrying until the outer JUnit timeout aborts the test.
+   */
+  private static final int SET_CONFIGURATION_MAX_ATTEMPTS = 20;
   private static final AtomicReference<SnapshotInfo> LEADER_SNAPSHOT_INFO_REF = new AtomicReference<>();
 
   private static final AtomicInteger numSnapshotRequests = new AtomicInteger();
@@ -523,6 +531,106 @@ public abstract class InstallSnapshotNotificationTests<CLUSTER extends MiniRaftC
     runWithNewCluster(1, this::testInstallSnapshotDuringBootstrap);
   }
 
+  /**
+   * Issue setConfiguration in a bounded retry loop.  Each iteration uses a no-retry client so the
+   * outer loop controls when we log state, sleep, and finally fail with the collected diagnostics.
+   */
+  private void setConfigurationWithBoundedRetry(CLUSTER cluster, RaftPeer[] peersInNewConf, String phase)
+      throws IOException {
+    IOException lastException = null;
+    for (int attempt = 1; attempt <= SET_CONFIGURATION_MAX_ATTEMPTS; attempt++) {
+      final RaftPeerId leaderId;
+      try {
+        leaderId = RaftTestUtil.waitForLeader(cluster).getId();
+      } catch (InterruptedException e) {
+        Thread.currentThread().interrupt();
+        final InterruptedIOException ioe = new InterruptedIOException(
+            phase + ": interrupted while waiting for leader before 
setConfiguration");
+        ioe.initCause(e);
+        dumpClusterState(cluster, phase + ": setConfiguration", peersInNewConf, ioe);
+        throw ioe;
+      }
+      LOG.info("{}: setConfiguration attempt #{}/{}, leaderId={}, 
targetConf={}",
+          phase, attempt, SET_CONFIGURATION_MAX_ATTEMPTS, leaderId, 
Arrays.asList(peersInNewConf));
+      // Disable the client's internal retry loop here.  We want one observable RPC attempt per
+      // outer iteration so that CI logs show which attempt failed and what the cluster looked like.
+      try (final RaftClient client = cluster.createClient(leaderId, RetryPolicies.noRetry())) {
+        final RaftClientReply reply = client.admin().setConfiguration(peersInNewConf);
+        if (reply.isSuccess()) {
+          LOG.info("{}: setConfiguration succeeded on attempt #{}: {}", phase, 
attempt, reply);
+          return;
+        }
+        lastException = new IOException("setConfiguration returned unsuccessful reply: " + reply);
+      } catch (IOException e) {
+        lastException = e;
+      }
+
+      LOG.warn("{}: setConfiguration attempt #{} failed, targetConf={}, 
lastException={}, {}",
+          phase, attempt, Arrays.asList(peersInNewConf),
+          lastException == null ? null : lastException.toString(), 
cluster.printServers());
+      if (attempt < SET_CONFIGURATION_MAX_ATTEMPTS) {
+        try {
+          ONE_SECOND.sleep();
+        } catch (InterruptedException e) {
+          Thread.currentThread().interrupt();
+          final InterruptedIOException ioe = new InterruptedIOException(
+              phase + ": interrupted while sleeping before setConfiguration 
retry");
+          ioe.initCause(e);
+          dumpClusterState(cluster, phase + ": setConfiguration", peersInNewConf, ioe);
+          throw ioe;
+        }
+      }
+    }
+
+    dumpClusterState(cluster, phase + ": setConfiguration", peersInNewConf, lastException);
+    throw lastException != null? lastException
+        : new IOException("setConfiguration failed after " + 
SET_CONFIGURATION_MAX_ATTEMPTS + " attempts");
+  }
+
+  /**
+   * When the new configuration never becomes stable, dump the same diagnostics we use for
+   * setConfiguration failures so the CI log shows whether the request failed early or the cluster
+   * got stuck during convergence.
+   */
+  private void waitAndCheckNewConfWithDiagnostics(CLUSTER cluster, RaftPeer[] peersInNewConf, String phase)
+      throws Exception {
+    try {
+      LOG.info("{}: waitAndCheckNewConf, targetConf={}", phase, 
Arrays.asList(peersInNewConf));
+      RaftServerTestUtil.waitAndCheckNewConf(cluster, peersInNewConf, 0, null);
+    } catch (AssertionError | Exception e) {
+      dumpClusterState(cluster, phase + ": waitAndCheckNewConf", peersInNewConf, e);
+      throw e;
+    }
+  }
+
+  /**
+   * Print both the summarized cluster view and per-division indices/snapshot state so that a CI
+   * timeout can be traced to bootstrap, snapshot installation, or configuration-commit lag.
+   */
+  private void dumpClusterState(CLUSTER cluster, String phase, RaftPeer[] peersInNewConf, Throwable cause) {
+    final SnapshotInfo leaderSnapshotInfo = LEADER_SNAPSHOT_INFO_REF.get();
+    LOG.error("{} failed: targetConf={}, numSnapshotRequests={}, 
numNotifyInstallSnapshotFinished={}, "
+            + "leaderSnapshotInfo={}\n{}\n{}",
+        phase, Arrays.asList(peersInNewConf), numSnapshotRequests.get(), numNotifyInstallSnapshotFinished.get(),
+        Optional.ofNullable(leaderSnapshotInfo).map(SnapshotInfo::getTermIndex).orElse(null),
+        cluster.printServers(), cluster.printAllLogs(), cause);
+
+    for (RaftServer.Division division : cluster.iterateDivisions()) {
+      final SnapshotInfo snapshot = division.getStateMachine().getLatestSnapshot();
+      LOG.error("{}: divisionState id={}, role={}, leaderId={}, 
leaderReady={}, alive={}, term={}, "
+              + "lastAppliedIndex={}, snapshot={}, logStartIndex={}, 
logNextIndex={}, commitIndex={}, conf={}, "
+              + "followerNextIndices={}, followerMatchIndices={}",
+          phase, division.getId(), division.getInfo().getCurrentRole(), division.getInfo().getLeaderId(),
+          division.getInfo().isLeaderReady(), division.getInfo().isAlive(), division.getInfo().getCurrentTerm(),
+          division.getInfo().getLastAppliedIndex(),
+          Optional.ofNullable(snapshot).map(SnapshotInfo::getTermIndex).orElse(null),
+          division.getRaftLog().getStartIndex(), division.getRaftLog().getNextIndex(),
+          division.getRaftLog().getLastCommittedIndex(), division.getRaftConf(),
+          Arrays.toString(division.getInfo().getFollowerNextIndices()),
+          Arrays.toString(division.getInfo().getFollowerMatchIndices()));
+    }
+  }
+
   private void testInstallSnapshotDuringBootstrap(CLUSTER cluster) throws Exception {
     LEADER_SNAPSHOT_INFO_REF.set(null);
     numSnapshotRequests.set(0);
@@ -559,11 +667,12 @@ public abstract class InstallSnapshotNotificationTests<CLUSTER extends MiniRaftC
       // add two more peers
       final MiniRaftCluster.PeerChanges change = cluster.addNewPeers(2, true,
           true);
+      final String phase = "testInstallSnapshotDuringBootstrap";
       // trigger setConfiguration
       RaftServerTestUtil.runWithMinorityPeers(cluster, Arrays.asList(change.allPeersInNewConf),
-          peers -> cluster.setConfiguration(peers.toArray(RaftPeer.emptyArray())));
+          peers -> setConfigurationWithBoundedRetry(cluster, peers.toArray(RaftPeer.emptyArray()), phase));
 
-      RaftServerTestUtil.waitAndCheckNewConf(cluster, change.allPeersInNewConf, 0, null);
+      waitAndCheckNewConfWithDiagnostics(cluster, change.allPeersInNewConf, phase);
 
       // Check the installed snapshot index on each Follower matches with the
       // leader snapshot.
