This is an automated email from the ASF dual-hosted git repository.

sodonnell pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/ozone.git


The following commit(s) were added to refs/heads/master by this push:
     new a2bcd5815ed HDDS-13762. Mis replication bug when there is a dead 
maintenance node (#9156)
a2bcd5815ed is described below

commit a2bcd5815edde240f0627b7e2fc3ea969434c164
Author: Siddhant Sangwan <[email protected]>
AuthorDate: Fri Oct 17 16:02:54 2025 +0530

    HDDS-13762. Mis replication bug when there is a dead maintenance node 
(#9156)
---
 .../hadoop/hdds/scm/SCMCommonPlacementPolicy.java  | 23 +++++++++-
 .../hdds/scm/TestSCMCommonPlacementPolicy.java     | 52 +++++++++++++++++++++-
 2 files changed, 73 insertions(+), 2 deletions(-)

diff --git 
a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/SCMCommonPlacementPolicy.java
 
b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/SCMCommonPlacementPolicy.java
index 0a0f6d93c29..934e13bb53b 100644
--- 
a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/SCMCommonPlacementPolicy.java
+++ 
b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/SCMCommonPlacementPolicy.java
@@ -46,6 +46,7 @@
 import org.apache.hadoop.hdds.scm.node.DatanodeInfo;
 import org.apache.hadoop.hdds.scm.node.NodeManager;
 import org.apache.hadoop.hdds.scm.node.NodeStatus;
+import org.apache.hadoop.hdds.scm.node.states.NodeNotFoundException;
 import org.apache.hadoop.ozone.container.common.volume.VolumeUsage;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
@@ -445,7 +446,27 @@ public ContainerPlacementStatus validateContainerPlacement(
       }
     }
     List<Integer> currentRackCount = new ArrayList<>(dns.stream()
-        .map(this::getPlacementGroup)
+        .map(dn -> {
+          Node rack = getPlacementGroup(dn);
+          if (rack == null) {
+            try {
+              NodeStatus nodeStatus = nodeManager.getNodeStatus(dn);
+              if (nodeStatus.isDead() && nodeStatus.isMaintenance()) {
+                LOG.debug("Using rack [{}] for dead and in-maintenance dn 
{}.", dn.getNetworkLocation(), dn);
+                return dn.getNetworkLocation();
+              }
+              return null;
+            } catch (NodeNotFoundException e) {
+              LOG.debug("Could not get NodeStatus for dn {}.", dn, e);
+              return null;
+            }
+          }
+          /*
+          Example path: data-centre/rack1/dn1. Here, data-centre/rack1 is the network location of dn1
+          and also the network full path of rack1, so both return paths yield the same rack key.
+          */
+          return rack.getNetworkFullPath();
+        })
         .filter(Objects::nonNull)
         .collect(Collectors.groupingBy(
             Function.identity(),
diff --git 
a/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/TestSCMCommonPlacementPolicy.java
 
b/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/TestSCMCommonPlacementPolicy.java
index b1f9a6f0f1c..dba2d60b98c 100644
--- 
a/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/TestSCMCommonPlacementPolicy.java
+++ 
b/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/TestSCMCommonPlacementPolicy.java
@@ -24,6 +24,8 @@
 import static org.junit.jupiter.api.Assertions.assertFalse;
 import static org.junit.jupiter.api.Assertions.assertNotEquals;
 import static org.junit.jupiter.api.Assertions.assertTrue;
+import static org.mockito.ArgumentMatchers.any;
+import static org.mockito.ArgumentMatchers.anyInt;
 import static org.mockito.Mockito.eq;
 import static org.mockito.Mockito.mock;
 import static org.mockito.Mockito.when;
@@ -34,6 +36,7 @@
 import java.io.File;
 import java.util.Arrays;
 import java.util.Collections;
+import java.util.HashMap;
 import java.util.HashSet;
 import java.util.List;
 import java.util.Map;
@@ -48,20 +51,25 @@
 import org.apache.hadoop.hdds.conf.OzoneConfiguration;
 import org.apache.hadoop.hdds.protocol.DatanodeDetails;
 import org.apache.hadoop.hdds.protocol.DatanodeID;
+import org.apache.hadoop.hdds.protocol.MockDatanodeDetails;
+import org.apache.hadoop.hdds.protocol.proto.HddsProtos;
 import 
org.apache.hadoop.hdds.protocol.proto.StorageContainerDatanodeProtocolProtos;
 import org.apache.hadoop.hdds.scm.container.ContainerID;
 import org.apache.hadoop.hdds.scm.container.ContainerReplica;
 import org.apache.hadoop.hdds.scm.container.MockNodeManager;
 import org.apache.hadoop.hdds.scm.exceptions.SCMException;
+import org.apache.hadoop.hdds.scm.net.NetworkTopology;
 import org.apache.hadoop.hdds.scm.net.Node;
 import org.apache.hadoop.hdds.scm.node.DatanodeInfo;
 import org.apache.hadoop.hdds.scm.node.NodeManager;
 import org.apache.hadoop.hdds.scm.node.NodeStatus;
+import org.apache.hadoop.hdds.scm.node.states.NodeNotFoundException;
 import org.apache.hadoop.ozone.container.common.SCMTestUtils;
 import org.apache.ozone.test.GenericTestUtils;
 import org.junit.jupiter.api.BeforeEach;
 import org.junit.jupiter.api.Test;
 import org.junit.jupiter.api.io.TempDir;
+import org.mockito.Mockito;
 
 /**
  * Test functions of SCMCommonPlacementPolicy.
@@ -520,6 +528,44 @@ public void 
testDatanodeIsInvalidInCaseOfIncreasingCommittedBytes() {
     assertFalse(placementPolicy.isValidNode(datanodeDetails, 100, 4000));
   }
 
+  /**
+   * Tests that the placement validation logic is able to figure out a dead 
maintenance node's rack using
+   * {@link DatanodeDetails#getNetworkLocation()}. So when there are three 
datanodes, two on one rack and the dead +
+   * maintenance one on another rack (for a ratis container), the placement is 
valid. It is expected that the
+   * maintenance node will return to the cluster later.
+   */
+  @Test
+  public void testValidatePlacementWithDeadMaintenanceNode() throws 
NodeNotFoundException {
+    DatanodeDetails maintenanceDn = 
MockDatanodeDetails.randomDatanodeDetails();
+    // create 4 Datanodes: 2 in-service healthy + 1 extra in-service healthy + 
1 dead and in-maintenance
+    List<DatanodeDetails> allNodes = 
ImmutableList.of(MockDatanodeDetails.randomDatanodeDetails(),
+        MockDatanodeDetails.randomDatanodeDetails(), 
MockDatanodeDetails.randomDatanodeDetails(), maintenanceDn);
+    Map<Integer, Integer> datanodeRackMap = new HashMap<>();
+    // dead, in-maintenance dn does not get any rack to simulate that it was 
removed from topology on dying
+    datanodeRackMap.put(0, 0); // dn0 on rack 0
+    datanodeRackMap.put(1, 0); // dn1 on rack 0
+    datanodeRackMap.put(2, 1); // dn2 (extra) on rack 1
+    NodeManager mockNodeManager = Mockito.mock(NodeManager.class);
+    
when(mockNodeManager.getNodeStatus(any(DatanodeDetails.class))).thenAnswer(invocation
 -> {
+      DatanodeDetails dn = invocation.getArgument(0);
+      if (dn.equals(maintenanceDn)) {
+        return 
NodeStatus.valueOf(HddsProtos.NodeOperationalState.IN_MAINTENANCE, 
HddsProtos.NodeState.DEAD);
+      }
+      return NodeStatus.inServiceHealthy();
+    });
+    when(mockNodeManager.getAllNodes()).thenAnswer(inv -> allNodes);
+
+    NetworkTopology topology = mock(NetworkTopology.class);
+    when(topology.getMaxLevel()).thenReturn(3); // leaf level
+    when(topology.getNumOfNodes(anyInt())).thenReturn(2); // total racks in 
the cluster
+    when(mockNodeManager.getClusterNetworkTopologyMap()).thenReturn(topology);
+
+    DummyPlacementPolicy placementPolicy = new 
DummyPlacementPolicy(mockNodeManager, conf, datanodeRackMap, 2);
+    ContainerPlacementStatus placementStatus = 
placementPolicy.validateContainerPlacement(
+        ImmutableList.of(allNodes.get(0), allNodes.get(1), allNodes.get(3)), 
3);
+    assertTrue(placementStatus.isPolicySatisfied());
+  }
+
   private static class DummyPlacementPolicy extends SCMCommonPlacementPolicy {
     private Map<DatanodeDetails, Node> rackMap;
     private List<Node> racks;
@@ -551,7 +597,11 @@ private static class DummyPlacementPolicy extends 
SCMCommonPlacementPolicy {
       super(nodeManager, conf);
       this.rackCnt = rackCnt;
       this.racks = IntStream.range(0, rackCnt)
-      .mapToObj(i -> mock(Node.class)).collect(Collectors.toList());
+      .mapToObj(i -> {
+        Node node = mock(Node.class);
+        when(node.getNetworkFullPath()).thenReturn(String.valueOf(i));
+        return node;
+      }).collect(Collectors.toList());
       final List<? extends DatanodeDetails> datanodeDetails = 
nodeManager.getAllNodes();
       rackMap = datanodeRackMap.entrySet().stream()
               .collect(Collectors.toMap(


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to