This is an automated email from the ASF dual-hosted git repository.
tejaskriya pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/ozone.git
The following commit(s) were added to refs/heads/master by this push:
new a5e1cd0a692 HDDS-13533. Show the summary of replicas verify checks (#8898)
a5e1cd0a692 is described below
commit a5e1cd0a6923d965441b1e69be9744099e045e56
Author: Sarveksha Yeshavantha Raju <[email protected]>
AuthorDate: Wed Sep 3 16:49:41 2025 +0530
HDDS-13533. Show the summary of replicas verify checks (#8898)
---
.../smoketest/debug/ozone-debug-keywords.robot | 3 +-
.../ozone/debug/replicas/ReplicasVerify.java | 172 +++++++++++++++++++--
2 files changed, 163 insertions(+), 12 deletions(-)
diff --git a/hadoop-ozone/dist/src/main/smoketest/debug/ozone-debug-keywords.robot b/hadoop-ozone/dist/src/main/smoketest/debug/ozone-debug-keywords.robot
index a6bed4524d2..d75bdd20607 100644
--- a/hadoop-ozone/dist/src/main/smoketest/debug/ozone-debug-keywords.robot
+++ b/hadoop-ozone/dist/src/main/smoketest/debug/ozone-debug-keywords.robot
@@ -36,7 +36,8 @@ Execute replicas verify container state debug tool
Parse replicas verify JSON output
[Arguments] ${output}
- ${json} = Evaluate json.loads('''${output}''') json
+ ${json_split} = Evaluate '''${output}'''.split('***')[0].strip()
+ ${json} = Evaluate json.loads('''${json_split}''') json
[Return] ${json}
Check to Verify Replicas
diff --git a/hadoop-ozone/tools/src/main/java/org/apache/hadoop/ozone/debug/replicas/ReplicasVerify.java b/hadoop-ozone/tools/src/main/java/org/apache/hadoop/ozone/debug/replicas/ReplicasVerify.java
index aeaa689fcde..4dc810be6b5 100644
--- a/hadoop-ozone/tools/src/main/java/org/apache/hadoop/ozone/debug/replicas/ReplicasVerify.java
+++ b/hadoop-ozone/tools/src/main/java/org/apache/hadoop/ozone/debug/replicas/ReplicasVerify.java
@@ -17,14 +17,25 @@
package org.apache.hadoop.ozone.debug.replicas;
+import static org.apache.hadoop.ozone.conf.OzoneServiceConfig.DEFAULT_SHUTDOWN_HOOK_PRIORITY;
+
import com.fasterxml.jackson.databind.node.ArrayNode;
import com.fasterxml.jackson.databind.node.ObjectNode;
import java.io.IOException;
+import java.io.PrintStream;
import java.util.ArrayList;
+import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
+import java.util.Map;
import java.util.Optional;
+import java.util.Set;
+import java.util.concurrent.ConcurrentHashMap;
+import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicBoolean;
+import java.util.concurrent.atomic.AtomicInteger;
+import java.util.function.Supplier;
+import org.apache.commons.lang3.time.DurationFormatUtils;
import org.apache.hadoop.hdds.client.ReplicationConfig;
import org.apache.hadoop.hdds.protocol.DatanodeDetails;
import org.apache.hadoop.hdds.scm.cli.ScmOption;
@@ -41,6 +52,7 @@
import org.apache.hadoop.ozone.shell.OzoneAddress;
import org.apache.hadoop.ozone.shell.Shell;
import org.apache.hadoop.ozone.shell.ShellReplicationOptions;
+import org.apache.hadoop.ozone.util.ShutdownHookManager;
import picocli.CommandLine;
/**
@@ -79,22 +91,80 @@ public class ReplicasVerify extends Handler {
private List<ReplicaVerifier> replicaVerifiers;
+ private static final String DURATION_FORMAT = "HH:mm:ss,SSS";
+ private long startTime;
+ private long endTime;
+ private String verificationScope;
+ private final List<String> verificationTypes = new ArrayList<>();
+ private final AtomicInteger volumesProcessed = new AtomicInteger(0);
+ private final AtomicInteger bucketsProcessed = new AtomicInteger(0);
+ private final AtomicInteger keysProcessed = new AtomicInteger(0);
+ private final AtomicInteger keysPassed = new AtomicInteger(0);
+ private final AtomicInteger keysFailed = new AtomicInteger(0);
+ private final Map<String, AtomicInteger> failuresByType = new ConcurrentHashMap<>();
+ private volatile Throwable exception;
+
+ private void addVerifier(boolean condition, Supplier<ReplicaVerifier> verifierSupplier) {
+ if (condition) {
+ ReplicaVerifier verifier = verifierSupplier.get();
+ replicaVerifiers.add(verifier);
+ String verifierType = verifier.getType();
+ verificationTypes.add(verifierType);
+ failuresByType.put(verifierType, new AtomicInteger(0));
+ }
+ }
+
@Override
protected void execute(OzoneClient client, OzoneAddress address) throws IOException {
+ startTime = System.nanoTime();
+
+ if (!address.getKeyName().isEmpty()) {
+ verificationScope = "Key";
+ } else if (!address.getBucketName().isEmpty()) {
+ verificationScope = "Bucket";
+ } else if (!address.getVolumeName().isEmpty()) {
+ verificationScope = "Volume";
+ } else {
+ verificationScope = "All Volumes";
+ }
+
replicaVerifiers = new ArrayList<>();
- if (verification.doExecuteChecksums) {
- replicaVerifiers.add(new ChecksumVerifier(getConf()));
- }
+ addVerifier(verification.doExecuteChecksums, () -> {
+ try {
+ return new ChecksumVerifier(getConf());
+ } catch (IOException e) {
+ throw new RuntimeException(e);
+ }
+ });
- if (verification.doExecuteBlockExistence) {
- replicaVerifiers.add(new BlockExistenceVerifier(getConf()));
- }
- if (verification.doExecuteReplicaState) {
- replicaVerifiers.add(new ContainerStateVerifier(getConf(), containerCacheSize));
- }
+ addVerifier(verification.doExecuteBlockExistence, () -> {
+ try {
+ return new BlockExistenceVerifier(getConf());
+ } catch (IOException e) {
+ throw new RuntimeException(e);
+ }
+ });
- findCandidateKeys(client, address);
+ addVerifier(verification.doExecuteReplicaState, () -> {
+ try {
+ return new ContainerStateVerifier(getConf(), containerCacheSize);
+ } catch (IOException e) {
+ throw new RuntimeException(e);
+ }
+ });
+
+ // Add shutdown hook to ensure summary is printed even if interrupted
+ addShutdownHook();
+
+ try {
+ findCandidateKeys(client, address);
+ } catch (Exception e) {
+ exception = e;
+ throw e;
+ } finally {
+ endTime = System.nanoTime();
+ }
}
@Override
@@ -133,6 +203,7 @@ void findCandidateKeys(OzoneClient ozoneClient, OzoneAddress address) throws IOE
void checkVolume(OzoneClient ozoneClient, OzoneVolume volume, ArrayNode keysArray, AtomicBoolean allKeysPassed)
throws IOException {
+ volumesProcessed.incrementAndGet();
for (Iterator<? extends OzoneBucket> it = volume.listBuckets(null); it.hasNext();) {
OzoneBucket bucket = it.next();
checkBucket(ozoneClient, bucket, keysArray, allKeysPassed);
@@ -141,6 +212,7 @@ void checkVolume(OzoneClient ozoneClient, OzoneVolume volume, ArrayNode keysArra
void checkBucket(OzoneClient ozoneClient, OzoneBucket bucket, ArrayNode keysArray, AtomicBoolean allKeysPassed)
throws IOException {
+ bucketsProcessed.incrementAndGet();
for (Iterator<? extends OzoneKey> it = bucket.listKeys(null); it.hasNext();) {
OzoneKey key = it.next();
// TODO: Remove this check once HDDS-12094 is fixed
@@ -152,6 +224,7 @@ void checkBucket(OzoneClient ozoneClient, OzoneBucket bucket, ArrayNode keysArra
void processKey(OzoneClient ozoneClient, String volumeName, String bucketName, String keyName,
ArrayNode keysArray, AtomicBoolean allKeysPassed) throws IOException {
+ keysProcessed.incrementAndGet();
OmKeyInfo keyInfo = ozoneClient.getProxy().getKeyInfo(
volumeName, bucketName, keyName, false);
@@ -167,6 +240,7 @@ void processKey(OzoneClient ozoneClient, String volumeName, String bucketName, S
ArrayNode blocksArray = keyNode.putArray("blocks");
boolean keyPass = true;
+ Set<String> failedVerificationTypes = new HashSet<>();
for (OmKeyLocationInfo keyLocation : keyInfo.getLatestVersionLocations().getBlocksLatestVersionOnly()) {
long containerID = keyLocation.getContainerID();
@@ -205,6 +279,7 @@ void processKey(OzoneClient ozoneClient, String volumeName, String bucketName, S
if (!result.passed()) {
replicaPass = false;
+ failedVerificationTypes.add(verifier.getType());
}
}
@@ -219,8 +294,15 @@ void processKey(OzoneClient ozoneClient, String volumeName, String bucketName, S
}
keyNode.put("pass", keyPass);
- if (!keyPass) {
+ if (keyPass) {
+ keysPassed.incrementAndGet();
+ } else {
+ keysFailed.incrementAndGet();
allKeysPassed.set(false);
+ failedVerificationTypes.forEach(failedType -> failuresByType
+ .computeIfAbsent(failedType, k -> new AtomicInteger(0))
+ .incrementAndGet()
+ );
}
if (!keyPass || allResults) {
@@ -228,6 +310,74 @@ void processKey(OzoneClient ozoneClient, String volumeName, String bucketName, S
}
}
+ /**
+ * Adds ShutdownHook to print summary statistics.
+ */
+ private void addShutdownHook() {
+ ShutdownHookManager.get().addShutdownHook(() -> {
+ if (endTime == 0) {
+ endTime = System.nanoTime();
+ }
+ printSummary(System.err);
+ }, DEFAULT_SHUTDOWN_HOOK_PRIORITY);
+ }
+
+ /**
+ * Prints summary of replica verification run.
+ *
+ * @param out PrintStream
+ */
+ void printSummary(PrintStream out) {
+ if (endTime == 0) {
+ endTime = System.nanoTime();
+ }
+
+ long execTimeNanos = endTime - startTime;
+ String execTime = DurationFormatUtils.formatDuration(TimeUnit.NANOSECONDS.toMillis(execTimeNanos), DURATION_FORMAT);
+
+ long totalKeysProcessed = keysProcessed.get();
+ long totalKeysPassed = keysPassed.get();
+ long totalKeysFailed = keysFailed.get();
+
+ out.println();
+ out.println("***************************************************");
+ out.println("REPLICA VERIFICATION SUMMARY");
+ out.println("***************************************************");
+ out.println("Status: " + (exception != null ? "Failed" :
+ (totalKeysFailed == 0 ? "Success" : "Completed with failures")));
+ out.println("Verification Scope: " + verificationScope);
+ out.println("Verification Types: " + String.join(", ", verificationTypes));
+ out.println("URI: " + uri);
+ out.println();
+ out.println("Number of Volumes processed: " + volumesProcessed.get());
+ out.println("Number of Buckets processed: " + bucketsProcessed.get());
+ out.println("Number of Keys processed: " + totalKeysProcessed);
+ out.println();
+ out.println("Keys passed verification: " + totalKeysPassed);
+ out.println("Keys failed verification: " + totalKeysFailed);
+
+ if (!failuresByType.isEmpty() && totalKeysFailed > 0) {
+ out.println();
+ for (String verificationType : verificationTypes) {
+ long typeFailures = failuresByType.get(verificationType).get();
+ if (typeFailures > 0) {
+ out.println("Keys failed " + verificationType + " verification: " + typeFailures);
+ }
+ }
+ out.println("Note: A key may fail multiple verification types, so total may exceed overall failures.");
+ }
+
+ out.println();
+ out.println("Total Execution time: " + execTime);
+
+ if (exception != null) {
+ out.println();
+ out.println("Exception: " + exception.getClass().getSimpleName() + ": " + exception.getMessage());
+ }
+
+ out.println("***************************************************");
+ }
+
/**
* Check if the key should be processed based on replication config.
* @param keyInfo the key to check
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]