This is an automated email from the ASF dual-hosted git repository.
sodonnell pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/ozone.git
The following commit(s) were added to refs/heads/master by this push:
new 0fda8b28859 HDDS-13544. DN Decommission Fails When Other Datanodes Are Offline Due to Invalid Affinity Node in Ratis Replication (#8934)
0fda8b28859 is described below
commit 0fda8b28859855090fd27665edf92d36e84b7f70
Author: Siddhant Sangwan <[email protected]>
AuthorDate: Tue Sep 30 00:57:03 2025 +0530
HDDS-13544. DN Decommission Fails When Other Datanodes Are Offline Due to Invalid Affinity Node in Ratis Replication (#8934)
---
.../replication/ReplicationManagerUtil.java | 25 ++++-
.../replication/TestReplicationManagerUtil.java | 12 +++
.../TestReplicationManagerIntegration.java | 108 ++++++++++++++++++++-
3 files changed, 139 insertions(+), 6 deletions(-)
diff --git a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/replication/ReplicationManagerUtil.java b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/replication/ReplicationManagerUtil.java
index 8b2ad758242..b08bdafd788 100644
--- a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/replication/ReplicationManagerUtil.java
+++ b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/replication/ReplicationManagerUtil.java
@@ -55,6 +55,15 @@ public final class ReplicationManagerUtil {
private ReplicationManagerUtil() {
}
+ private static String formatDatanodeDetails(List<DatanodeDetails> dns) {
+ if (dns == null) {
+ return "[]";
+ }
+ return dns.stream()
+ .map(dn -> String.format("%s[%s]", dn, dn.getPersistedOpState()))
+ .collect(Collectors.toList()).toString();
+ }
+
/**
* Using the passed placement policy attempt to select a list of datanodes to
* use as new targets. If the placement policy is unable to select enough
@@ -102,8 +111,9 @@ public static List<DatanodeDetails> getTargetDatanodes(PlacementPolicy policy,
}
}
throw new SCMException(String.format("Placement Policy: %s did not return"
- + " any nodes. Number of required Nodes %d, Data size Required:
%d",
- policy.getClass(), requiredNodes, dataSizeRequired),
+ + " any nodes. Number of required Nodes %d, Data size Required:
%d. Container: %s, Used Nodes %s, " +
+ "Excluded Nodes: %s.", policy.getClass(), requiredNodes,
dataSizeRequired, container,
+ formatDatanodeDetails(usedNodes),
formatDatanodeDetails(excludedNodes)),
SCMException.ResultCodes.FAILED_TO_FIND_SUITABLE_NODE);
}
@@ -178,6 +188,17 @@ An UNHEALTHY replica with unique origin node id of a QUASI_CLOSED container shou
excludedNodes.add(r.getDatanodeDetails());
continue;
}
+ if (nodeStatus.isMaintenance() && nodeStatus.isDead()) {
+ // Dead maintenance nodes are removed from the network topology, so the topology logic can't find
+ // out their location and hence can't consider them for figuring out rack placement. So, we don't add them
+ // to the used nodes list. We also don't add them to excluded nodes, as the placement policy logic won't
+ // consider a node that's not in the topology anyway. In fact, adding it to excluded nodes will cause a
+ // problem if total nodes (in topology) + required nodes becomes less than excluded + used nodes.
+
+ // TODO: In the future, can the policy logic be changed to use the DatanodeDetails network location to figure
+ // out the rack?
+ continue;
+ }
} catch (NodeNotFoundException e) {
LOG.warn("Node {} not found in node manager.", r.getDatanodeDetails());
// This should not happen, but if it does, just add the node to the
diff --git a/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/container/replication/TestReplicationManagerUtil.java b/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/container/replication/TestReplicationManagerUtil.java
index a7718bdf24c..a3573dc2e47 100644
--- a/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/container/replication/TestReplicationManagerUtil.java
+++ b/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/container/replication/TestReplicationManagerUtil.java
@@ -25,6 +25,7 @@
import static org.apache.hadoop.hdds.scm.container.replication.ReplicationTestUtil.createContainerReplica;
import static org.assertj.core.api.Assertions.assertThat;
import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.junit.jupiter.api.Assertions.assertFalse;
import static org.mockito.Mockito.any;
import static org.mockito.Mockito.doReturn;
import static org.mockito.Mockito.mock;
@@ -97,6 +98,12 @@ public void testGetExcludedAndUsedNodes() throws NodeNotFoundException {
IN_MAINTENANCE, ContainerReplicaProto.State.CLOSED, 1);
replicas.add(maintenance);
+ // dead maintenance node should neither be on the used list nor on the excluded list
+ ContainerReplica deadMaintenanceReplica = createContainerReplica(cid, 0,
+ IN_MAINTENANCE, ContainerReplicaProto.State.CLOSED, 1);
+ DatanodeDetails deadMaintenanceNode = deadMaintenanceReplica.getDatanodeDetails();
+ replicas.add(deadMaintenanceReplica);
+
// Take one of the replicas and set it to be removed. It should be on the
// excluded list rather than the used list.
Set<ContainerReplica> toBeRemoved = new HashSet<>();
@@ -115,6 +122,9 @@ public void testGetExcludedAndUsedNodes() throws NodeNotFoundException {
when(replicationManager.getNodeStatus(any())).thenAnswer(
invocation -> {
final DatanodeDetails dn = invocation.getArgument(0);
+ if (dn.equals(deadMaintenanceNode)) {
+ return NodeStatus.valueOf(dn.getPersistedOpState(), HddsProtos.NodeState.DEAD);
+ }
for (ContainerReplica r : replicas) {
if (r.getDatanodeDetails().equals(dn)) {
return NodeStatus.valueOf(
@@ -137,6 +147,7 @@ public void testGetExcludedAndUsedNodes() throws NodeNotFoundException {
.contains(maintenance.getDatanodeDetails());
assertThat(excludedAndUsedNodes.getUsedNodes())
.contains(pendingAdd);
+ assertFalse(excludedAndUsedNodes.getUsedNodes().contains(deadMaintenanceNode));
assertEquals(4, excludedAndUsedNodes.getExcludedNodes().size());
assertThat(excludedAndUsedNodes.getExcludedNodes())
@@ -147,6 +158,7 @@ public void testGetExcludedAndUsedNodes() throws NodeNotFoundException {
.contains(remove.getDatanodeDetails());
assertThat(excludedAndUsedNodes.getExcludedNodes())
.contains(pendingDelete);
+ assertFalse(excludedAndUsedNodes.getExcludedNodes().contains(deadMaintenanceNode));
}
@Test
diff --git a/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/hdds/scm/container/replication/TestReplicationManagerIntegration.java b/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/hdds/scm/container/replication/TestReplicationManagerIntegration.java
index 8556a01e533..21abf3f7772 100644
--- a/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/hdds/scm/container/replication/TestReplicationManagerIntegration.java
+++ b/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/hdds/scm/container/replication/TestReplicationManagerIntegration.java
@@ -25,6 +25,7 @@
import static org.apache.hadoop.hdds.HddsConfigKeys.HDDS_NODE_REPORT_INTERVAL;
import static org.apache.hadoop.hdds.HddsConfigKeys.HDDS_PIPELINE_REPORT_INTERVAL;
import static org.apache.hadoop.hdds.HddsConfigKeys.HDDS_SCM_WAIT_TIME_AFTER_SAFE_MODE_EXIT;
+import static org.apache.hadoop.hdds.protocol.proto.HddsProtos.NodeOperationalState.DECOMMISSIONED;
import static org.apache.hadoop.hdds.protocol.proto.HddsProtos.NodeOperationalState.IN_MAINTENANCE;
import static org.apache.hadoop.hdds.protocol.proto.HddsProtos.NodeOperationalState.IN_SERVICE;
import static org.apache.hadoop.hdds.protocol.proto.HddsProtos.NodeState.DEAD;
@@ -40,6 +41,7 @@
import static org.apache.hadoop.hdds.scm.node.TestNodeUtil.waitForDnToReachPersistedOpState;
import static org.apache.hadoop.ozone.OzoneConfigKeys.OZONE_SCM_CLOSE_CONTAINER_WAIT_DURATION;
import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.junit.jupiter.api.Assertions.assertTrue;
import java.nio.charset.StandardCharsets;
import java.time.Duration;
@@ -47,6 +49,7 @@
import java.util.List;
import java.util.Set;
import java.util.UUID;
+import java.util.stream.Collectors;
import org.apache.hadoop.hdds.client.RatisReplicationConfig;
import org.apache.hadoop.hdds.conf.OzoneConfiguration;
import org.apache.hadoop.hdds.protocol.DatanodeDetails;
@@ -59,6 +62,7 @@
import org.apache.hadoop.hdds.scm.container.ContainerManager;
import org.apache.hadoop.hdds.scm.container.ContainerNotFoundException;
import org.apache.hadoop.hdds.scm.container.ContainerReplica;
+import org.apache.hadoop.hdds.scm.container.ReplicationManagerReport;
import org.apache.hadoop.hdds.scm.container.replication.ReplicationManager.ReplicationManagerConfiguration;
import org.apache.hadoop.hdds.scm.node.NodeManager;
import org.apache.hadoop.hdds.scm.server.StorageContainerManager;
@@ -71,8 +75,8 @@
import org.apache.hadoop.ozone.client.OzoneKeyDetails;
import org.apache.hadoop.ozone.client.OzoneKeyLocation;
import org.apache.ozone.test.GenericTestUtils;
-import org.junit.jupiter.api.AfterAll;
-import org.junit.jupiter.api.BeforeAll;
+import org.junit.jupiter.api.AfterEach;
+import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.MethodOrderer;
import org.junit.jupiter.api.Order;
import org.junit.jupiter.api.Test;
@@ -104,7 +108,7 @@ class TestReplicationManagerIntegration {
private ContainerOperationClient scmClient;
private OzoneBucket bucket;
- @BeforeAll
+ @BeforeEach
void init() throws Exception {
OzoneConfiguration conf = new OzoneConfiguration();
@@ -152,7 +156,7 @@ void init() throws Exception {
bucket = TestDataUtil.createVolumeAndBucket(client);
}
- @AfterAll
+ @AfterEach
void shutdown() {
IOUtils.close(LOG, client, scmClient, cluster);
}
@@ -260,4 +264,100 @@ void testClosedContainerReplicationWhenNodeDecommissionAndBackToInService(
}
}, 200, 30000);
}
+
+ /**
+ * A node containing a replica of a Ratis container is put into maintenance with no expiry. While it is in maintenance,
+ * it is shut down and the test waits until it's handled as dead. Then another node containing this container's replica
+ * is decommissioned. The expectation is that it should decommission successfully.
+ */
+ @Test
+ public void testDeadMaintenanceNodeAndDecommission() throws Exception {
+ String keyName = "key-" + UUID.randomUUID();
+ TestDataUtil.createKey(bucket, keyName, RATIS_REPLICATION_CONFIG,
+ "this is the content".getBytes(StandardCharsets.UTF_8));
+
+ OzoneKeyDetails key = bucket.getKey(keyName);
+ List<OzoneKeyLocation> keyLocations = key.getOzoneKeyLocations();
+
+ long iD = keyLocations.get(0).getContainerID();
+ ContainerID containerId = ContainerID.valueOf(iD);
+ ContainerInfo containerInfo = containerManager.getContainer(containerId);
+ OzoneTestUtils.closeContainer(scm, containerInfo);
+
+ assertEquals(HEALTHY_REPLICA_NUM,
+ containerManager.getContainerReplicas(containerId).size());
+
+ List<DatanodeDetails> dns = containerManager
+ .getContainerReplicas(containerId)
+ .stream().map(ContainerReplica::getDatanodeDetails)
+ .collect(Collectors.toList());
+
+ DatanodeDetails maintenanceDn = dns.get(0);
+ DatanodeDetails decomDn = dns.get(1);
+
+ scmClient.startMaintenanceNodes(Collections.singletonList(getDNHostAndPort(maintenanceDn)), 0, false);
+ waitForDnToReachOpState(nodeManager, maintenanceDn, IN_MAINTENANCE);
+ cluster.shutdownHddsDatanode(maintenanceDn);
+ waitForDnToReachHealthState(nodeManager, maintenanceDn, DEAD);
+
+ ContainerReplicaCount containerReplicaCount = replicationManager.getContainerReplicaCount(containerId);
+ assertTrue(containerReplicaCount.isSufficientlyReplicated());
+
+ scmClient.decommissionNodes(Collections.singletonList(getDNHostAndPort(decomDn)), false);
+ waitForDnToReachOpState(nodeManager, decomDn, DECOMMISSIONED);
+
+ assertEquals(HEALTHY_REPLICA_NUM + 1, containerManager.getContainerReplicas(containerId).size());
+ ReplicationManagerReport report = new ReplicationManagerReport();
+ replicationManager.checkContainerStatus(containerInfo, report);
+ assertEquals(0, report.getStat(ReplicationManagerReport.HealthState.UNDER_REPLICATED));
+ assertEquals(0, report.getStat(ReplicationManagerReport.HealthState.MIS_REPLICATED));
+ assertEquals(0, report.getStat(ReplicationManagerReport.HealthState.OVER_REPLICATED));
+ }
+
+ @Test
+ public void testOneDeadMaintenanceNodeAndOneLiveMaintenanceNodeAndOneDecommissionNode() throws Exception {
+ String keyName = "key-" + UUID.randomUUID();
+ TestDataUtil.createKey(bucket, keyName, RATIS_REPLICATION_CONFIG,
+ "this is the content".getBytes(StandardCharsets.UTF_8));
+
+ OzoneKeyDetails key = bucket.getKey(keyName);
+ List<OzoneKeyLocation> keyLocations = key.getOzoneKeyLocations();
+
+ long iD = keyLocations.get(0).getContainerID();
+ ContainerID containerId = ContainerID.valueOf(iD);
+ ContainerInfo containerInfo = containerManager.getContainer(containerId);
+ OzoneTestUtils.closeContainer(scm, containerInfo);
+
+ assertEquals(HEALTHY_REPLICA_NUM,
+ containerManager.getContainerReplicas(containerId).size());
+
+ List<DatanodeDetails> dns = containerManager
+ .getContainerReplicas(containerId)
+ .stream().map(ContainerReplica::getDatanodeDetails)
+ .collect(Collectors.toList());
+
+ DatanodeDetails maintenanceDn = dns.get(0);
+ DatanodeDetails decomDn = dns.get(1);
+ DatanodeDetails secondMaintenanceDn = dns.get(2);
+
+ scmClient.startMaintenanceNodes(Collections.singletonList(getDNHostAndPort(maintenanceDn)), 0, false);
+ waitForDnToReachOpState(nodeManager, maintenanceDn, IN_MAINTENANCE);
+ cluster.shutdownHddsDatanode(maintenanceDn);
+ waitForDnToReachHealthState(nodeManager, maintenanceDn, DEAD);
+
+ ContainerReplicaCount containerReplicaCount = replicationManager.getContainerReplicaCount(containerId);
+ assertTrue(containerReplicaCount.isSufficientlyReplicated());
+
+ scmClient.decommissionNodes(Collections.singletonList(getDNHostAndPort(decomDn)), false);
+ scmClient.startMaintenanceNodes(Collections.singletonList(getDNHostAndPort(secondMaintenanceDn)), 0, false);
+ waitForDnToReachOpState(nodeManager, decomDn, DECOMMISSIONED);
+ waitForDnToReachOpState(nodeManager, secondMaintenanceDn, IN_MAINTENANCE);
+
+ assertEquals(HEALTHY_REPLICA_NUM + 2, containerManager.getContainerReplicas(containerId).size());
+ ReplicationManagerReport report = new ReplicationManagerReport();
+ replicationManager.checkContainerStatus(containerInfo, report);
+ assertEquals(0, report.getStat(ReplicationManagerReport.HealthState.UNDER_REPLICATED));
+ assertEquals(0, report.getStat(ReplicationManagerReport.HealthState.MIS_REPLICATED));
+ assertEquals(0, report.getStat(ReplicationManagerReport.HealthState.OVER_REPLICATED));
+ }
}
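
For context, below is a minimal, self-contained sketch of the classification rule that the ReplicationManagerUtil change above introduces. It is not part of the patch: the Replica and ExcludedAndUsedNodes types here are simplified, hypothetical stand-ins for the real SCM classes, and the handling of replicas other than dead maintenance nodes is deliberately reduced to the two branches shown.

// Illustrative sketch only, not part of the commit. Simplified stand-in types.
import java.util.ArrayList;
import java.util.List;

final class DeadMaintenanceClassificationSketch {

  /** Minimal replica model: which datanode holds it and that node's state. */
  static final class Replica {
    final String datanode;
    final boolean inMaintenance;
    final boolean dead;

    Replica(String datanode, boolean inMaintenance, boolean dead) {
      this.datanode = datanode;
      this.inMaintenance = inMaintenance;
      this.dead = dead;
    }
  }

  /** Result handed to the placement policy: nodes already used vs. excluded. */
  static final class ExcludedAndUsedNodes {
    final List<String> used = new ArrayList<>();
    final List<String> excluded = new ArrayList<>();
  }

  static ExcludedAndUsedNodes classify(List<Replica> replicas) {
    ExcludedAndUsedNodes result = new ExcludedAndUsedNodes();
    for (Replica r : replicas) {
      if (r.inMaintenance && r.dead) {
        // The rule added by the patch: a dead maintenance node has been removed
        // from the network topology, so it goes on neither list. Listing it would
        // only shrink the pool of candidate targets and can make the policy fail.
        continue;
      }
      if (r.dead) {
        // Simplification for this sketch: any other unusable replica holder is excluded.
        result.excluded.add(r.datanode);
        continue;
      }
      // Live replica holders count as used nodes for rack-placement purposes.
      result.used.add(r.datanode);
    }
    return result;
  }

  public static void main(String[] args) {
    List<Replica> replicas = new ArrayList<>();
    replicas.add(new Replica("dn1", false, false)); // healthy replica holder
    replicas.add(new Replica("dn2", true, true));   // died while in maintenance
    replicas.add(new Replica("dn3", false, true));  // dead, not in maintenance
    ExcludedAndUsedNodes nodes = classify(replicas);
    System.out.println("used=" + nodes.used + " excluded=" + nodes.excluded);
    // prints: used=[dn1] excluded=[dn3]
  }
}

The point of the rule shows up in main(): the node that died while in maintenance (dn2) appears on neither list, so it cannot push excluded plus used nodes past the number of nodes the placement policy can actually see in the topology.
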
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]