This is an automated email from the ASF dual-hosted git repository.
adoroszlai pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/ozone.git
The following commit(s) were added to refs/heads/master by this push:
new 9036e1ab6e8 HDDS-13535. Show under/over-replication in `replicas verify --container-state` results (#9135)
9036e1ab6e8 is described below
commit 9036e1ab6e87f5fb848bed2a29cc7d4f5ba51897
Author: ChenChen Lai <[email protected]>
AuthorDate: Tue Dec 9 18:42:57 2025 +0800
HDDS-13535. Show under/over-replication in `replicas verify --container-state` results (#9135)
---
.../smoketest/debug/ozone-debug-keywords.robot | 20 ++++
.../debug/replicas/ContainerStateVerifier.java | 105 ++++++++++++++++-----
2 files changed, 102 insertions(+), 23 deletions(-)
diff --git a/hadoop-ozone/dist/src/main/smoketest/debug/ozone-debug-keywords.robot b/hadoop-ozone/dist/src/main/smoketest/debug/ozone-debug-keywords.robot
index d75bdd20607..aa51febb318 100644
--- a/hadoop-ozone/dist/src/main/smoketest/debug/ozone-debug-keywords.robot
+++ b/hadoop-ozone/dist/src/main/smoketest/debug/ozone-debug-keywords.robot
@@ -43,6 +43,11 @@ Parse replicas verify JSON output
Check to Verify Replicas
[Arguments] ${json} ${check_type} ${faulty_datanode}
${expected_message}
${replicas} = Get From Dictionary ${json['keys'][0]['blocks'][0]}
replicas
+ Run Keyword If '${check_type}' == 'containerState' Check Container
State Replicas ${replicas} ${faulty_datanode} ${expected_message}
+ ... ELSE Check Standard Replicas ${replicas} ${check_type}
${faulty_datanode} ${expected_message}
+
+Check Standard Replicas
+ [Arguments] ${replicas} ${check_type} ${faulty_datanode}
${expected_message}
FOR ${replica} IN @{replicas}
${datanode} = Get From Dictionary ${replica} datanode
${hostname} = Get From Dictionary ${datanode} hostname
@@ -50,6 +55,21 @@ Check to Verify Replicas
Run Keyword If '${hostname}' != '${faulty_datanode}' Check
Replica Passed ${replica} ${check_type}
END
+Check Container State Replicas
+ [Arguments] ${replicas} ${faulty_datanode} ${expected_message}
+ FOR ${replica} IN @{replicas}
+ ${datanode} = Get From Dictionary ${replica} datanode
+ ${hostname} = Get From Dictionary ${datanode} hostname
+ ${checks} = Get From Dictionary ${replica} checks
+ ${check} = Get From List ${checks} 0
+ Should Be Equal ${check['type']} containerState
+ Should Be Equal ${check['pass']} ${False}
+ ${actual_message} = Set Variable
${check['failures'][0]['message']}
+
+ Run Keyword If '${hostname}' == '${faulty_datanode}' Should
Contain ${actual_message} ${expected_message}
+ ... ELSE Should Match Regexp ${actual_message} Replica
state is (OPEN|CLOSING|QUASI_CLOSED|CLOSED)
+ END
+
Check Replica Failed
[Arguments] ${replica} ${check_type} ${expected_message}
${checks} = Get From Dictionary ${replica} checks
diff --git a/hadoop-ozone/tools/src/main/java/org/apache/hadoop/ozone/debug/replicas/ContainerStateVerifier.java b/hadoop-ozone/tools/src/main/java/org/apache/hadoop/ozone/debug/replicas/ContainerStateVerifier.java
index 465f1e99c94..0ed4cb10898 100644
--- a/hadoop-ozone/tools/src/main/java/org/apache/hadoop/ozone/debug/replicas/ContainerStateVerifier.java
+++ b/hadoop-ozone/tools/src/main/java/org/apache/hadoop/ozone/debug/replicas/ContainerStateVerifier.java
@@ -21,6 +21,7 @@
import com.google.common.cache.CacheBuilder;
import java.io.IOException;
import java.util.EnumSet;
+import java.util.List;
import java.util.Objects;
import java.util.Set;
import org.apache.hadoop.hdds.conf.OzoneConfiguration;
@@ -32,6 +33,8 @@
import org.apache.hadoop.hdds.scm.XceiverClientSpi;
import org.apache.hadoop.hdds.scm.cli.ContainerOperationClient;
import org.apache.hadoop.hdds.scm.container.ContainerInfo;
+import org.apache.hadoop.hdds.scm.container.ContainerReplicaInfo;
+import org.apache.hadoop.hdds.scm.container.replication.ContainerHealthResult;
import org.apache.hadoop.hdds.scm.pipeline.Pipeline;
import org.apache.hadoop.hdds.scm.storage.ContainerProtocolCalls;
import org.apache.hadoop.ozone.om.helpers.OmKeyLocationInfo;
@@ -45,8 +48,8 @@ public class ContainerStateVerifier implements
ReplicaVerifier {
private static final long DEFAULT_CONTAINER_CACHE_SIZE = 1000000;
private final ContainerOperationClient containerOperationClient;
private final XceiverClientManager xceiverClientManager;
- // cache for container info and encodedToken from the SCM
- private final Cache<Long, ContainerInfoToken> encodedTokenCache;
+ // cache for information about the container from SCM
+ private final Cache<Long, ContainerInformation> containerCache;
private static final Set<ContainerDataProto.State> GOOD_REPLICA_STATES =
EnumSet.of(
@@ -73,7 +76,9 @@ public ContainerStateVerifier(OzoneConfiguration conf, long
containerCacheSize)
". Falling back to default: " + DEFAULT_CONTAINER_CACHE_SIZE);
containerCacheSize = DEFAULT_CONTAINER_CACHE_SIZE;
}
- encodedTokenCache =
CacheBuilder.newBuilder().maximumSize(containerCacheSize).build();
+ containerCache = CacheBuilder.newBuilder()
+ .maximumSize(containerCacheSize)
+ .build();
}
@Override
@@ -87,19 +92,29 @@ public BlockVerificationResult verifyBlock(DatanodeDetails
datanode, OmKeyLocati
StringBuilder replicaCheckMsg = new StringBuilder().append("Replica
state is ");
boolean pass = false;
- ContainerInfoToken containerInfoToken =
getContainerInfoToken(keyLocation.getContainerID());
- ContainerDataProto containerData =
fetchContainerDataFromDatanode(datanode, keyLocation.getContainerID(),
- keyLocation, containerInfoToken);
+ long containerID = keyLocation.getContainerID();
+ ContainerInformation containerInformation =
fetchContainerInformationFromSCM(containerID);
+ ContainerDataProto containerData =
fetchContainerDataFromDatanode(datanode, containerID,
+ keyLocation, containerInformation.getEncodedToken());
if (containerData == null) {
return BlockVerificationResult.failIncomplete("No container data
returned from DN.");
}
ContainerDataProto.State state = containerData.getState();
replicaCheckMsg.append(state.name());
- if (areContainerAndReplicasInGoodState(state,
containerInfoToken.getContainerState())) {
+ boolean replicaStateGood = areContainerAndReplicasInGoodState(state,
containerInformation.getContainerState());
+ replicaCheckMsg.append(", Container state in SCM is
").append(containerInformation.getContainerState());
+
+ String replicationStatus = containerInformation.getReplicationStatus();
+ replicaCheckMsg.append(", ").append(replicationStatus);
+
+ // Replication status check evaluates container-level health by counting
healthy replicas
+ // across all datanodes. Therefore, when a container is UNDER_REPLICATED
or OVER_REPLICATED,
+ // this information should be reflected in all replica outputs, not just
the unhealthy ones.
+ if
(replicationStatus.startsWith(ContainerHealthResult.HealthState.HEALTHY.name())
+ && replicaStateGood) {
pass = true;
}
- replicaCheckMsg.append(", Container state in SCM is
").append(containerInfoToken.getContainerState());
if (pass) {
return BlockVerificationResult.pass();
@@ -123,13 +138,12 @@ private boolean
areContainerAndReplicasInGoodState(ContainerDataProto.State repl
private ContainerDataProto fetchContainerDataFromDatanode(DatanodeDetails
dn, long containerId,
OmKeyLocationInfo
keyLocation,
- ContainerInfoToken
containerInfoToken)
+ String
encodedToken)
throws IOException {
XceiverClientSpi client = null;
ReadContainerResponseProto response;
try {
Pipeline pipeline = keyLocation.getPipeline().copyForReadFromNode(dn);
- String encodedToken = containerInfoToken.getEncodedToken();
client = xceiverClientManager.acquireClientForReadData(pipeline);
response = ContainerProtocolCalls
@@ -146,27 +160,67 @@ private ContainerDataProto
fetchContainerDataFromDatanode(DatanodeDetails dn, lo
return response.getContainerData();
}
- private ContainerInfoToken getContainerInfoToken(long containerId)
+ private ContainerInformation fetchContainerInformationFromSCM(long
containerId)
throws IOException {
- ContainerInfoToken cachedData =
encodedTokenCache.getIfPresent(containerId);
+ ContainerInformation cachedData = containerCache.getIfPresent(containerId);
if (cachedData != null) {
return cachedData;
}
- // Cache miss - fetch and store
- ContainerInfo info = containerOperationClient.getContainer(containerId);
+ // Cache miss - fetch container info, token, and compute replication status
+ ContainerInfo containerInfo =
containerOperationClient.getContainer(containerId);
String encodeToken =
containerOperationClient.getEncodedContainerToken(containerId);
- cachedData = new ContainerInfoToken(info.getState(), encodeToken);
- encodedTokenCache.put(containerId, cachedData);
+ String replicationStatus = computeReplicationStatus(containerId,
containerInfo);
+ cachedData = new ContainerInformation(containerInfo.getState(),
encodeToken, replicationStatus);
+ containerCache.put(containerId, cachedData);
return cachedData;
}
- private static class ContainerInfoToken {
- private HddsProtos.LifeCycleState state;
+ private String computeReplicationStatus(long containerId, ContainerInfo
containerInfo) {
+ try {
+ List<ContainerReplicaInfo> replicaInfos =
+ containerOperationClient.getContainerReplicas(containerId);
+
+ if (replicaInfos.isEmpty()) {
+ return ContainerHealthResult.HealthState.UNDER_REPLICATED
+ + ": no replicas found";
+ }
+
+ int requiredNodes =
+ containerInfo.getReplicationConfig().getRequiredNodes();
+ int healthyReplicas = 0;
+
+ for (ContainerReplicaInfo replicaInfo : replicaInfos) {
+ if (!"UNHEALTHY".equals(replicaInfo.getState())) {
+ healthyReplicas++;
+ }
+ }
+
+ if (healthyReplicas == requiredNodes) {
+ return ContainerHealthResult.HealthState.HEALTHY.toString();
+ }
+
+ ContainerHealthResult.HealthState status =
+ healthyReplicas < requiredNodes
+ ? ContainerHealthResult.HealthState.UNDER_REPLICATED
+ : ContainerHealthResult.HealthState.OVER_REPLICATED;
+
+ return String.format("%s: %d/%d healthy replicas",
+ status, healthyReplicas, requiredNodes);
+ } catch (Exception e) {
+ return "REPLICATION_CHECK_FAILED: " + e.getMessage();
+ }
+ }
+
+ /** Information from SCM about the container needed for each replica. */
+ private static class ContainerInformation {
+ private final HddsProtos.LifeCycleState state;
private final String encodedToken;
+ private final String replicationStatus;
- ContainerInfoToken(HddsProtos.LifeCycleState lifeState, String token) {
+ ContainerInformation(HddsProtos.LifeCycleState lifeState, String token,
String replicationStatus) {
this.state = lifeState;
this.encodedToken = token;
+ this.replicationStatus = replicationStatus;
}
@Override
@@ -174,17 +228,18 @@ public boolean equals(Object o) {
if (this == o) {
return true;
}
- if (!(o instanceof ContainerInfoToken)) {
+ if (!(o instanceof ContainerInformation)) {
return false;
}
- ContainerInfoToken key = (ContainerInfoToken) o;
+ ContainerInformation key = (ContainerInformation) o;
return Objects.equals(state, key.state) &&
- Objects.equals(encodedToken, key.encodedToken);
+ Objects.equals(encodedToken, key.encodedToken) &&
+ Objects.equals(replicationStatus, key.replicationStatus);
}
@Override
public int hashCode() {
- return Objects.hash(state, encodedToken);
+ return Objects.hash(state, encodedToken, replicationStatus);
}
public HddsProtos.LifeCycleState getContainerState() {
@@ -194,6 +249,10 @@ public HddsProtos.LifeCycleState getContainerState() {
public String getEncodedToken() {
return encodedToken;
}
+
+ public String getReplicationStatus() {
+ return replicationStatus;
+ }
}
}
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]