This is an automated email from the ASF dual-hosted git repository.

agupta pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/ozone.git


The following commit(s) were added to refs/heads/master by this push:
     new c7117dcc17 HDDS-11974. Split Container Safemode Rule into Ratis & EC Container Safemode Rules (#7951)
c7117dcc17 is described below

commit c7117dcc1731a5f8a82fc6f06b99bda9cd6e01c0
Author: Peter Lee <[email protected]>
AuthorDate: Tue Apr 15 16:07:20 2025 +0800

    HDDS-11974. Split Container Safemode Rule into Ratis & EC Container Safemode Rules (#7951)
---
 .../hdds/scm/safemode/ContainerSafeModeRule.java   | 359 ---------------------
 .../hdds/scm/safemode/ECContainerSafeModeRule.java | 249 ++++++++++++++
 .../scm/safemode/RatisContainerSafeModeRule.java   | 201 ++++++++++++
 .../hdds/scm/safemode/SCMSafeModeManager.java      |  18 +-
 .../hdds/scm/safemode/SafeModeRuleFactory.java     |   7 +-
 .../hdds/scm/safemode/TestSCMSafeModeManager.java  |  29 +-
 .../hdds/scm/safemode/TestSafeModeRuleFactory.java |   2 +-
 7 files changed, 485 insertions(+), 380 deletions(-)

diff --git a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/safemode/ContainerSafeModeRule.java b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/safemode/ContainerSafeModeRule.java
deleted file mode 100644
index 4fec2e4a17..0000000000
--- a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/safemode/ContainerSafeModeRule.java
+++ /dev/null
@@ -1,359 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.hadoop.hdds.scm.safemode;
-
-import static 
org.apache.hadoop.hdds.HddsConfigKeys.HDDS_SCM_SAFEMODE_THRESHOLD_PCT;
-import static 
org.apache.hadoop.hdds.HddsConfigKeys.HDDS_SCM_SAFEMODE_THRESHOLD_PCT_DEFAULT;
-
-import com.google.common.annotations.VisibleForTesting;
-import com.google.common.base.Preconditions;
-import com.google.common.collect.Sets;
-import java.util.HashSet;
-import java.util.List;
-import java.util.Map;
-import java.util.Set;
-import java.util.UUID;
-import java.util.concurrent.ConcurrentHashMap;
-import java.util.concurrent.atomic.AtomicLong;
-import java.util.stream.Collectors;
-import org.apache.hadoop.hdds.client.ReplicationConfig;
-import org.apache.hadoop.hdds.conf.ConfigurationSource;
-import org.apache.hadoop.hdds.protocol.DatanodeDetails;
-import org.apache.hadoop.hdds.protocol.proto.HddsProtos;
-import org.apache.hadoop.hdds.protocol.proto.HddsProtos.LifeCycleState;
-import org.apache.hadoop.hdds.protocol.proto.HddsProtos.ReplicationType;
-import 
org.apache.hadoop.hdds.protocol.proto.StorageContainerDatanodeProtocolProtos;
-import org.apache.hadoop.hdds.scm.container.ContainerID;
-import org.apache.hadoop.hdds.scm.container.ContainerInfo;
-import org.apache.hadoop.hdds.scm.container.ContainerManager;
-import org.apache.hadoop.hdds.scm.container.ContainerNotFoundException;
-import org.apache.hadoop.hdds.scm.events.SCMEvents;
-import 
org.apache.hadoop.hdds.scm.server.SCMDatanodeProtocolServer.NodeRegistrationContainerReport;
-import org.apache.hadoop.hdds.server.events.EventQueue;
-import org.apache.hadoop.hdds.server.events.TypedEvent;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-/**
- * Class defining Safe mode exit criteria for Containers.
- */
-public class ContainerSafeModeRule extends
-    SafeModeExitRule<NodeRegistrationContainerReport> {
-
-  private static final Logger LOG = 
LoggerFactory.getLogger(ContainerSafeModeRule.class);
-  
-  private static final String NAME = "ContainerSafeModeRule";
-
-  private final ContainerManager containerManager;
-  // Required cutoff % for containers with at least 1 reported replica.
-  private final double safeModeCutoff;
-  // Containers read from scm db (excluding containers in ALLOCATED state).
-  private final Set<Long> ratisContainers;
-  private final Set<Long> ecContainers;
-  private final Map<Long, Set<UUID>> ecContainerDNsMap;
-  private final AtomicLong ratisContainerWithMinReplicas = new AtomicLong(0);
-  private final AtomicLong ecContainerWithMinReplicas = new AtomicLong(0);
-
-  private double ratisMaxContainer;
-  private double ecMaxContainer;
-
-  public ContainerSafeModeRule(final EventQueue eventQueue,
-                               final ConfigurationSource conf,
-                               final ContainerManager containerManager,
-                               final SCMSafeModeManager manager) {
-    super(manager, NAME, eventQueue);
-    this.safeModeCutoff = getSafeModeCutoff(conf);
-    this.containerManager = containerManager;
-    this.ratisContainers = new HashSet<>();
-    this.ecContainers = new HashSet<>();
-    this.ecContainerDNsMap = new ConcurrentHashMap<>();
-    initializeRule();
-  }
-
-
-  private static double getSafeModeCutoff(ConfigurationSource conf) {
-    final double cutoff = conf.getDouble(HDDS_SCM_SAFEMODE_THRESHOLD_PCT,
-        HDDS_SCM_SAFEMODE_THRESHOLD_PCT_DEFAULT);
-    Preconditions.checkArgument((cutoff >= 0.0 && cutoff <= 1.0),
-        HDDS_SCM_SAFEMODE_THRESHOLD_PCT  +
-            " value should be >= 0.0 and <= 1.0");
-    return cutoff;
-  }
-
-  @Override
-  protected TypedEvent<NodeRegistrationContainerReport> getEventType() {
-    return SCMEvents.CONTAINER_REGISTRATION_REPORT;
-  }
-
-  @Override
-  protected synchronized boolean validate() {
-    if (validateBasedOnReportProcessing()) {
-      return (getCurrentContainerThreshold() >= safeModeCutoff) &&
-          (getCurrentECContainerThreshold() >= safeModeCutoff);
-    }
-
-    // TODO: Split ContainerSafeModeRule into RatisContainerSafeModeRule and
-    //   ECContainerSafeModeRule
-    final List<ContainerInfo> containers = containerManager.getContainers(
-        ReplicationType.RATIS);
-
-    return containers.stream()
-        .filter(this::isClosed)
-        .map(ContainerInfo::containerID)
-        .noneMatch(this::isMissing);
-  }
-
-  /**
-   * Checks if the container has any replica.
-   */
-  private boolean isMissing(ContainerID id) {
-    try {
-      return containerManager.getContainerReplicas(id).isEmpty();
-    } catch (ContainerNotFoundException ex) {
-      /*
-       * This should never happen, in case this happens the container
-       * somehow got removed from SCM.
-       * Safemode rule doesn't have to log/fix this. We will just exclude this
-       * from the rule validation.
-       */
-      return false;
-
-    }
-  }
-
-  @VisibleForTesting
-  public double getCurrentContainerThreshold() {
-    return ratisMaxContainer == 0 ? 1 :
-        (ratisContainerWithMinReplicas.doubleValue() / ratisMaxContainer);
-  }
-
-  @VisibleForTesting
-  public double getCurrentECContainerThreshold() {
-    return ecMaxContainer == 0 ? 1 :
-        (ecContainerWithMinReplicas.doubleValue() / ecMaxContainer);
-  }
-
-
-  // TODO: Report processing logic will be removed in future. HDDS-11958.
-  @Override
-  protected synchronized void process(
-      final NodeRegistrationContainerReport reportsProto) {
-    final DatanodeDetails datanodeDetails = reportsProto.getDatanodeDetails();
-    final UUID datanodeUUID = datanodeDetails.getUuid();
-    StorageContainerDatanodeProtocolProtos.ContainerReportsProto report = 
reportsProto.getReport();
-
-    report.getReportsList().forEach(c -> {
-      long containerID = c.getContainerID();
-
-
-      // If it is a Ratis container.
-      if (ratisContainers.contains(containerID)) {
-        recordReportedContainer(containerID, Boolean.FALSE);
-        ratisContainers.remove(containerID);
-      }
-
-      // If it is an EC container.
-      if (ecContainers.contains(containerID)) {
-        putInContainerDNsMap(containerID, ecContainerDNsMap, datanodeUUID);
-        recordReportedContainer(containerID, Boolean.TRUE);
-      }
-    });
-
-    if (scmInSafeMode()) {
-      SCMSafeModeManager.getLogger().info(
-          "SCM in safe mode. {} % containers [Ratis] have at least one"
-          + " reported replica, {} % containers [EC] have at N reported 
replica.",
-          getCurrentContainerThreshold() * 100, 
getCurrentECContainerThreshold() * 100);
-    }
-  }
-
-  /**
-   * Record the reported Container.
-   *
-   * We will differentiate and count according to the type of Container.
-   *
-   * @param containerID containerID
-   * @param isEcContainer true, means ECContainer, false, means not 
ECContainer.
-   */
-  private void recordReportedContainer(long containerID, boolean 
isEcContainer) {
-
-    int uuids = 1;
-    if (isEcContainer && ecContainerDNsMap.containsKey(containerID)) {
-      uuids = ecContainerDNsMap.get(containerID).size();
-    }
-
-    int minReplica = getMinReplica(containerID);
-    if (uuids >= minReplica) {
-      if (isEcContainer) {
-        getSafeModeMetrics()
-            .incCurrentContainersWithECDataReplicaReportedCount();
-        ecContainerWithMinReplicas.getAndAdd(1);
-      } else {
-        ratisContainerWithMinReplicas.getAndAdd(1);
-        getSafeModeMetrics()
-            .incCurrentContainersWithOneReplicaReportedCount();
-      }
-    }
-  }
-
-  /**
-   * Get the minimum replica.
-   *
-   * If it is a Ratis Contianer, the minimum copy is 1.
-   * If it is an EC Container, the minimum copy will be the number of Data in 
replicationConfig.
-   *
-   * @param pContainerID containerID
-   * @return MinReplica.
-   */
-  private int getMinReplica(long pContainerID) {
-
-    try {
-      ContainerID containerID = ContainerID.valueOf(pContainerID);
-      ContainerInfo container = containerManager.getContainer(containerID);
-      ReplicationConfig replicationConfig = container.getReplicationConfig();
-      return replicationConfig.getMinimumNodes();
-    } catch (ContainerNotFoundException e) {
-      LOG.error("containerId = {} not found.", pContainerID, e);
-    } catch (Exception e) {
-      LOG.error("containerId = {} not found.", pContainerID, e);
-    }
-
-    return 1;
-  }
-
-  private void putInContainerDNsMap(long containerID, Map<Long, Set<UUID>> 
containerDNsMap,
-      UUID datanodeUUID) {
-    containerDNsMap.computeIfAbsent(containerID, key -> Sets.newHashSet());
-    containerDNsMap.get(containerID).add(datanodeUUID);
-  }
-
-  @Override
-  protected synchronized void cleanup() {
-    ratisContainers.clear();
-    ecContainers.clear();
-    ecContainerDNsMap.clear();
-  }
-
-  @Override
-  public String getStatusText() {
-
-    // ratis container
-    String status = String.format(
-        "%1.2f%% of [Ratis] Containers(%s / %s) with at least one reported 
replica (=%1.2f) >= " +
-        "safeModeCutoff (=%1.2f);",
-        getCurrentContainerThreshold() * 100,
-        ratisContainerWithMinReplicas, (long) ratisMaxContainer,
-        getCurrentContainerThreshold(), this.safeModeCutoff);
-
-    Set<Long> sampleRatisContainers = ratisContainers.stream().
-        limit(SAMPLE_CONTAINER_DISPLAY_LIMIT).
-        collect(Collectors.toSet());
-
-    if (!sampleRatisContainers.isEmpty()) {
-      String sampleContainerText =
-          "Sample Ratis Containers not satisfying the criteria : " + 
sampleRatisContainers + ";";
-      status = status.concat("\n").concat(sampleContainerText);
-    }
-
-    // ec container
-    String ecStatus = String.format(
-        "%1.2f%% of [EC] Containers(%s / %s) with at least N reported replica 
(=%1.2f) >= " +
-        "safeModeCutoff (=%1.2f);",
-        getCurrentECContainerThreshold() * 100,
-        ecContainerWithMinReplicas, (long) ecMaxContainer,
-        getCurrentECContainerThreshold(), this.safeModeCutoff);
-    status = status.concat("\n").concat(ecStatus);
-
-    Set<Long> sampleEcContainers = ecContainerDNsMap.entrySet().stream().
-        filter(entry -> {
-          Long containerId = entry.getKey();
-          int minReplica = getMinReplica(containerId);
-          Set<UUID> allReplicas = entry.getValue();
-          if (allReplicas.size() >= minReplica) {
-            return false;
-          }
-          return true;
-        }).
-        map(Map.Entry::getKey).
-        limit(SAMPLE_CONTAINER_DISPLAY_LIMIT).
-        collect(Collectors.toSet());
-
-    if (!sampleEcContainers.isEmpty()) {
-      String sampleECContainerText =
-          "Sample EC Containers not satisfying the criteria : " + 
sampleEcContainers + ";";
-      status = status.concat("\n").concat(sampleECContainerText);
-    }
-
-    return status;
-  }
-
-
-  @Override
-  public synchronized void refresh(boolean forceRefresh) {
-    if (forceRefresh || !validate()) {
-      initializeRule();
-    }
-  }
-
-  private boolean isClosed(ContainerInfo container) {
-    final LifeCycleState state = container.getState();
-    return state == LifeCycleState.QUASI_CLOSED ||
-        state == LifeCycleState.CLOSED;
-  }
-
-  private void initializeRule() {
-    final List<ContainerInfo> containers = containerManager.getContainers();
-    // Clean up the related data in the map.
-    ratisContainers.clear();
-    ecContainers.clear();
-
-    // Iterate through the container list to
-    // get the minimum replica count for each container.
-    containers.forEach(container -> {
-      // There can be containers in OPEN/CLOSING state which were never
-      // created by the client. We are not considering these containers for
-      // now. These containers can be handled by tracking pipelines.
-
-      HddsProtos.ReplicationType replicationType = 
container.getReplicationType();
-
-      if (isClosed(container) && container.getNumberOfKeys() > 0) {
-        // If it's of type Ratis
-        if (replicationType.equals(HddsProtos.ReplicationType.RATIS)) {
-          ratisContainers.add(container.getContainerID());
-        }
-
-        // If it's of type EC
-        if (replicationType.equals(HddsProtos.ReplicationType.EC)) {
-          ecContainers.add(container.getContainerID());
-        }
-      }
-    });
-
-    ratisMaxContainer = ratisContainers.size();
-    ecMaxContainer = ecContainers.size();
-
-    long ratisCutOff = (long) Math.ceil(ratisMaxContainer * safeModeCutoff);
-    long ecCutOff = (long) Math.ceil(ecMaxContainer * safeModeCutoff);
-
-    
getSafeModeMetrics().setNumContainerWithOneReplicaReportedThreshold(ratisCutOff);
-    
getSafeModeMetrics().setNumContainerWithECDataReplicaReportedThreshold(ecCutOff);
-
-    LOG.info("Refreshed Containers with one replica threshold count {}, " +
-        "with ec n replica threshold count {}.", ratisCutOff, ecCutOff);
-  }
-}
diff --git a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/safemode/ECContainerSafeModeRule.java b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/safemode/ECContainerSafeModeRule.java
new file mode 100644
index 0000000000..a6c30aace9
--- /dev/null
+++ b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/safemode/ECContainerSafeModeRule.java
@@ -0,0 +1,249 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.hdds.scm.safemode;
+
+import static 
org.apache.hadoop.hdds.HddsConfigKeys.HDDS_SCM_SAFEMODE_THRESHOLD_PCT;
+import static 
org.apache.hadoop.hdds.HddsConfigKeys.HDDS_SCM_SAFEMODE_THRESHOLD_PCT_DEFAULT;
+
+import com.google.common.annotations.VisibleForTesting;
+import com.google.common.base.Preconditions;
+import com.google.common.collect.Sets;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+import java.util.UUID;
+import java.util.concurrent.ConcurrentHashMap;
+import java.util.concurrent.atomic.AtomicLong;
+import java.util.stream.Collectors;
+import org.apache.hadoop.hdds.client.ReplicationConfig;
+import org.apache.hadoop.hdds.conf.ConfigurationSource;
+import org.apache.hadoop.hdds.protocol.DatanodeDetails;
+import org.apache.hadoop.hdds.protocol.proto.HddsProtos.LifeCycleState;
+import org.apache.hadoop.hdds.protocol.proto.HddsProtos.ReplicationType;
+import org.apache.hadoop.hdds.scm.container.ContainerID;
+import org.apache.hadoop.hdds.scm.container.ContainerInfo;
+import org.apache.hadoop.hdds.scm.container.ContainerManager;
+import org.apache.hadoop.hdds.scm.container.ContainerNotFoundException;
+import org.apache.hadoop.hdds.scm.events.SCMEvents;
+import 
org.apache.hadoop.hdds.scm.server.SCMDatanodeProtocolServer.NodeRegistrationContainerReport;
+import org.apache.hadoop.hdds.server.events.EventQueue;
+import org.apache.hadoop.hdds.server.events.TypedEvent;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * Safe mode rule for EC containers.
+ */
+public class ECContainerSafeModeRule extends 
SafeModeExitRule<NodeRegistrationContainerReport> {
+
+  private static final Logger LOG = 
LoggerFactory.getLogger(ECContainerSafeModeRule.class);
+  private static final String NAME = "ECContainerSafeModeRule";
+  private static final int DEFAULT_MIN_REPLICA = 1;
+
+  private final ContainerManager containerManager;
+  private final double safeModeCutoff;
+  private final Set<Long> ecContainers;
+  private final Map<Long, Set<UUID>> ecContainerDNsMap;
+  private final AtomicLong ecContainerWithMinReplicas;
+  private double ecMaxContainer;
+
+  public ECContainerSafeModeRule(EventQueue eventQueue,
+      ConfigurationSource conf,
+      ContainerManager containerManager,
+      SCMSafeModeManager manager) {
+    super(manager, NAME, eventQueue);
+    this.safeModeCutoff = getSafeModeCutoff(conf);
+    this.containerManager = containerManager;
+    this.ecContainers = new HashSet<>();
+    this.ecContainerDNsMap = new ConcurrentHashMap<>();
+    this.ecContainerWithMinReplicas = new AtomicLong(0);
+    initializeRule();
+  }
+
+  private static double getSafeModeCutoff(ConfigurationSource conf) {
+    final double cutoff = conf.getDouble(HDDS_SCM_SAFEMODE_THRESHOLD_PCT,
+        HDDS_SCM_SAFEMODE_THRESHOLD_PCT_DEFAULT);
+    Preconditions.checkArgument((cutoff >= 0.0 && cutoff <= 1.0),
+        HDDS_SCM_SAFEMODE_THRESHOLD_PCT + " value should be >= 0.0 and <= 
1.0");
+    return cutoff;
+  }
+
+  @Override
+  protected TypedEvent<NodeRegistrationContainerReport> getEventType() {
+    return SCMEvents.CONTAINER_REGISTRATION_REPORT;
+  }
+
+  @Override
+  protected synchronized boolean validate() {
+    if (validateBasedOnReportProcessing()) {
+      return getCurrentContainerThreshold() >= safeModeCutoff;
+    }
+
+    final List<ContainerInfo> containers = containerManager.getContainers(
+        ReplicationType.EC);
+
+    return containers.stream()
+        .filter(this::isClosed)
+        .map(ContainerInfo::containerID)
+        .noneMatch(this::isMissing);
+  }
+
+  /**
+   * Checks if the container has at least the minimum required number of 
replicas.
+   */
+  private boolean isMissing(ContainerID id) {
+    try {
+      int minReplica = getMinReplica(id.getId());
+      return containerManager.getContainerReplicas(id).size() < minReplica;
+    } catch (ContainerNotFoundException ex) {
+      /*
+       * This should never happen, in case this happens the container
+       * somehow got removed from SCM.
+       * Safemode rule doesn't have to log/fix this. We will just exclude this
+       * from the rule validation.
+       */
+      return false;
+    }
+  }
+
+  @VisibleForTesting
+  public double getCurrentContainerThreshold() {
+    return ecMaxContainer == 0 ? 1 : (ecContainerWithMinReplicas.doubleValue() 
/ ecMaxContainer);
+  }
+
+  /**
+   * Get the minimum replica.
+   *
+   * @param pContainerID containerID
+   * @return MinReplica.
+   */
+  private int getMinReplica(long pContainerID) {
+    try {
+      ContainerID containerID = ContainerID.valueOf(pContainerID);
+      ContainerInfo container = containerManager.getContainer(containerID);
+      ReplicationConfig replicationConfig = container.getReplicationConfig();
+      return replicationConfig.getMinimumNodes();
+    } catch (Exception e) {
+      LOG.error("containerId = {} not found.", pContainerID, e);
+    }
+
+    return DEFAULT_MIN_REPLICA;
+  }
+
+  @Override
+  protected void process(NodeRegistrationContainerReport report) {
+    DatanodeDetails datanodeDetails = report.getDatanodeDetails();
+    UUID datanodeUUID = datanodeDetails.getUuid();
+
+    report.getReport().getReportsList().forEach(c -> {
+      long containerID = c.getContainerID();
+      if (ecContainers.contains(containerID)) {
+        putInContainerDNsMap(containerID, ecContainerDNsMap, datanodeUUID);
+        recordReportedContainer(containerID);
+      }
+    });
+
+    if (scmInSafeMode()) {
+      SCMSafeModeManager.getLogger().info(
+          "SCM in safe mode. {} % containers [EC] have at N reported replica",
+          getCurrentContainerThreshold() * 100);
+    }
+  }
+
+  private void putInContainerDNsMap(long containerID,
+      Map<Long, Set<UUID>> containerDNsMap,
+      UUID datanodeUUID) {
+    containerDNsMap.computeIfAbsent(containerID, key -> 
Sets.newHashSet()).add(datanodeUUID);
+  }
+
+  /**
+   * Record the reported Container.
+   *
+   * @param containerID containerID
+   */
+  private void recordReportedContainer(long containerID) {
+
+    int uuids = 1;
+    if (ecContainerDNsMap.containsKey(containerID)) {
+      uuids = ecContainerDNsMap.get(containerID).size();
+    }
+
+    int minReplica = getMinReplica(containerID);
+    if (uuids >= minReplica) {
+      getSafeModeMetrics()
+          .incCurrentContainersWithECDataReplicaReportedCount();
+      ecContainerWithMinReplicas.getAndAdd(1);
+    }
+  }
+
+  private void initializeRule() {
+    ecContainers.clear();
+    ecContainerDNsMap.clear();
+    containerManager.getContainers(ReplicationType.EC).stream()
+        .filter(this::isClosed).filter(c -> c.getNumberOfKeys() > 0)
+        .map(ContainerInfo::getContainerID).forEach(ecContainers::add);
+    ecMaxContainer = ecContainers.size();
+    long ecCutOff = (long) Math.ceil(ecMaxContainer * safeModeCutoff);
+    
getSafeModeMetrics().setNumContainerWithECDataReplicaReportedThreshold(ecCutOff);
+
+    LOG.info("Refreshed Containers with ec n replica threshold count {}.", 
ecCutOff);
+  }
+
+  private boolean isClosed(ContainerInfo container) {
+    final LifeCycleState state = container.getState();
+    return state == LifeCycleState.QUASI_CLOSED || state == 
LifeCycleState.CLOSED;
+  }
+
+  @Override
+  public String getStatusText() {
+    String status = String.format(
+        "%1.2f%% of [EC] Containers(%s / %s) with at least N reported replica 
(=%1.2f) >= " +
+            "safeModeCutoff (=%1.2f);",
+        getCurrentContainerThreshold() * 100,
+        ecContainerWithMinReplicas, (long) ecMaxContainer,
+        getCurrentContainerThreshold(), this.safeModeCutoff);
+
+    Set<Long> sampleEcContainers = 
ecContainerDNsMap.entrySet().stream().filter(entry -> {
+      Long containerId = entry.getKey();
+      int minReplica = getMinReplica(containerId);
+      Set<UUID> allReplicas = entry.getValue();
+      return allReplicas.size() < minReplica;
+    
}).map(Map.Entry::getKey).limit(SAMPLE_CONTAINER_DISPLAY_LIMIT).collect(Collectors.toSet());
+
+    if (!sampleEcContainers.isEmpty()) {
+      String sampleECContainerText = "Sample EC Containers not satisfying the 
criteria : " + sampleEcContainers + ";";
+      status = status.concat("\n").concat(sampleECContainerText);
+    }
+
+    return status;
+  }
+
+  @Override
+  public synchronized void refresh(boolean forceRefresh) {
+    if (forceRefresh || !validate()) {
+      initializeRule();
+    }
+  }
+
+  @Override
+  protected void cleanup() {
+    ecContainers.clear();
+    ecContainerDNsMap.clear();
+  }
+}
diff --git a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/safemode/RatisContainerSafeModeRule.java b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/safemode/RatisContainerSafeModeRule.java
new file mode 100644
index 0000000000..4015fd81b4
--- /dev/null
+++ b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/safemode/RatisContainerSafeModeRule.java
@@ -0,0 +1,201 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.hdds.scm.safemode;
+
+import static 
org.apache.hadoop.hdds.HddsConfigKeys.HDDS_SCM_SAFEMODE_THRESHOLD_PCT;
+import static 
org.apache.hadoop.hdds.HddsConfigKeys.HDDS_SCM_SAFEMODE_THRESHOLD_PCT_DEFAULT;
+
+import com.google.common.annotations.VisibleForTesting;
+import com.google.common.base.Preconditions;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Set;
+import java.util.concurrent.atomic.AtomicLong;
+import java.util.stream.Collectors;
+import org.apache.hadoop.hdds.conf.ConfigurationSource;
+import org.apache.hadoop.hdds.protocol.proto.HddsProtos.LifeCycleState;
+import org.apache.hadoop.hdds.protocol.proto.HddsProtos.ReplicationType;
+import org.apache.hadoop.hdds.scm.container.ContainerID;
+import org.apache.hadoop.hdds.scm.container.ContainerInfo;
+import org.apache.hadoop.hdds.scm.container.ContainerManager;
+import org.apache.hadoop.hdds.scm.container.ContainerNotFoundException;
+import org.apache.hadoop.hdds.scm.events.SCMEvents;
+import 
org.apache.hadoop.hdds.scm.server.SCMDatanodeProtocolServer.NodeRegistrationContainerReport;
+import org.apache.hadoop.hdds.server.events.EventQueue;
+import org.apache.hadoop.hdds.server.events.TypedEvent;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+
+/**
+ * Class defining Safe mode exit criteria for Ratis Containers.
+ */
+public class RatisContainerSafeModeRule extends 
SafeModeExitRule<NodeRegistrationContainerReport> {
+
+  private static final Logger LOG = 
LoggerFactory.getLogger(RatisContainerSafeModeRule.class);
+  private static final String NAME = "RatisContainerSafeModeRule";
+
+  private final ContainerManager containerManager;
+  // Required cutoff % for containers with at least 1 reported replica.
+  private final double safeModeCutoff;
+  // Containers read from scm db (excluding containers in ALLOCATED state).
+  private final Set<Long> ratisContainers;
+  private final AtomicLong ratisContainerWithMinReplicas;
+  private double ratisMaxContainer;
+
+  public RatisContainerSafeModeRule(EventQueue eventQueue,
+      ConfigurationSource conf,
+      ContainerManager containerManager,
+      SCMSafeModeManager manager) {
+    super(manager, NAME, eventQueue);
+    this.safeModeCutoff = getSafeModeCutoff(conf);
+    this.containerManager = containerManager;
+    this.ratisContainers = new HashSet<>();
+    this.ratisContainerWithMinReplicas = new AtomicLong(0);
+    initializeRule();
+  }
+
+  private static double getSafeModeCutoff(ConfigurationSource conf) {
+    final double cutoff = conf.getDouble(HDDS_SCM_SAFEMODE_THRESHOLD_PCT,
+        HDDS_SCM_SAFEMODE_THRESHOLD_PCT_DEFAULT);
+    Preconditions.checkArgument((cutoff >= 0.0 && cutoff <= 1.0),
+        HDDS_SCM_SAFEMODE_THRESHOLD_PCT + " value should be >= 0.0 and <= 
1.0");
+    return cutoff;
+  }
+
+  @Override
+  protected TypedEvent<NodeRegistrationContainerReport> getEventType() {
+    return SCMEvents.CONTAINER_REGISTRATION_REPORT;
+  }
+
+  @Override
+  protected synchronized boolean validate() {
+    if (validateBasedOnReportProcessing()) {
+      return (getCurrentContainerThreshold() >= safeModeCutoff);
+    }
+
+    final List<ContainerInfo> containers = containerManager.getContainers(
+        ReplicationType.RATIS);
+
+    return containers.stream()
+        .filter(this::isClosed)
+        .map(ContainerInfo::containerID)
+        .noneMatch(this::isMissing);
+  }
+
+  /**
+   * Checks if the container has any replica.
+   */
+  private boolean isMissing(ContainerID id) {
+    try {
+      return containerManager.getContainerReplicas(id).isEmpty();
+    } catch (ContainerNotFoundException ex) {
+      /*
+       * This should never happen, in case this happens the container
+       * somehow got removed from SCM.
+       * Safemode rule doesn't have to log/fix this. We will just exclude this
+       * from the rule validation.
+       */
+      return false;
+
+    }
+  }
+
+  @VisibleForTesting
+  public double getCurrentContainerThreshold() {
+    return ratisMaxContainer == 0 ? 1 : 
(ratisContainerWithMinReplicas.doubleValue() / ratisMaxContainer);
+  }
+
+  @Override
+  protected void process(NodeRegistrationContainerReport report) {
+    report.getReport().getReportsList().forEach(c -> {
+      long containerID = c.getContainerID();
+      if (ratisContainers.contains(containerID)) {
+        recordReportedContainer(containerID);
+        ratisContainers.remove(containerID);
+      }
+    });
+
+    if (scmInSafeMode()) {
+      SCMSafeModeManager.getLogger().info(
+          "SCM in safe mode. {} % containers [Ratis] have at least one 
reported replica",
+          String.format("%.2f", getCurrentContainerThreshold() * 100));
+    }
+  }
+
+  /**
+   * Record the reported Container.
+   *
+   * @param containerID containerID
+   */
+  private void recordReportedContainer(long containerID) {
+    ratisContainerWithMinReplicas.getAndAdd(1);
+    getSafeModeMetrics()
+        .incCurrentContainersWithOneReplicaReportedCount();
+  }
+
+  private void initializeRule() {
+    ratisContainers.clear();
+    containerManager.getContainers(ReplicationType.RATIS).stream()
+        .filter(this::isClosed).filter(c -> c.getNumberOfKeys() > 0)
+        .map(ContainerInfo::getContainerID).forEach(ratisContainers::add);
+    ratisMaxContainer = ratisContainers.size();
+    long ratisCutOff = (long) Math.ceil(ratisMaxContainer * safeModeCutoff);
+    
getSafeModeMetrics().setNumContainerWithOneReplicaReportedThreshold(ratisCutOff);
+
+    LOG.info("Refreshed Containers with one replica threshold count {}.", 
ratisCutOff);
+  }
+
+  private boolean isClosed(ContainerInfo container) {
+    final LifeCycleState state = container.getState();
+    return state == LifeCycleState.QUASI_CLOSED || state == 
LifeCycleState.CLOSED;
+  }
+
+  @Override
+  public String getStatusText() {
+    String status = String.format(
+        "%1.2f%% of [Ratis] Containers(%s / %s) with at least one reported 
replica (=%1.2f) >= " +
+            "safeModeCutoff (=%1.2f);",
+        getCurrentContainerThreshold() * 100,
+        ratisContainerWithMinReplicas, (long) ratisMaxContainer,
+        getCurrentContainerThreshold(), this.safeModeCutoff);
+
+    Set<Long> sampleRatisContainers = 
ratisContainers.stream().limit(SAMPLE_CONTAINER_DISPLAY_LIMIT)
+        .collect(Collectors.toSet());
+
+    if (!sampleRatisContainers.isEmpty()) {
+      String sampleContainerText = "Sample Ratis Containers not satisfying the 
criteria : " + sampleRatisContainers
+          + ";";
+      status = status.concat("\n").concat(sampleContainerText);
+    }
+
+    return status;
+  }
+
+  @Override
+  public synchronized void refresh(boolean forceRefresh) {
+    if (forceRefresh || !validate()) {
+      initializeRule();
+    }
+  }
+
+  @Override
+  protected void cleanup() {
+    ratisContainers.clear();
+  }
+}
diff --git a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/safemode/SCMSafeModeManager.java b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/safemode/SCMSafeModeManager.java
index 6378459925..9dd79ca815 100644
--- a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/safemode/SCMSafeModeManager.java
+++ b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/safemode/SCMSafeModeManager.java
@@ -91,8 +91,9 @@ public class SCMSafeModeManager implements SafeModeManager {
   private Map<String, SafeModeExitRule> exitRules = new HashMap<>(1);
   private Set<String> preCheckRules = new HashSet<>(1);
   private ConfigurationSource config;
+  private static final String RATIS_CONTAINER_EXIT_RULE = 
"RatisContainerSafeModeRule";
+  private static final String EC_CONTAINER_EXIT_RULE = 
"ECContainerSafeModeRule";
   private static final String DN_EXIT_RULE = "DataNodeSafeModeRule";
-  private static final String CONT_EXIT_RULE = "ContainerSafeModeRule";
   private static final String HEALTHY_PIPELINE_EXIT_RULE =
       "HealthyPipelineSafeModeRule";
   private static final String ATLEAST_ONE_DATANODE_REPORTED_PIPELINE_EXIT_RULE 
=
@@ -319,19 +320,24 @@ public static Logger getLogger() {
 
   @VisibleForTesting
   public double getCurrentContainerThreshold() {
-    return ((ContainerSafeModeRule) exitRules.get(CONT_EXIT_RULE))
+    return ((RatisContainerSafeModeRule) 
exitRules.get(RATIS_CONTAINER_EXIT_RULE))
         .getCurrentContainerThreshold();
   }
 
   @VisibleForTesting
   public double getCurrentECContainerThreshold() {
-    return ((ContainerSafeModeRule) exitRules.get(CONT_EXIT_RULE))
-        .getCurrentECContainerThreshold();
+    return ((ECContainerSafeModeRule) exitRules.get(EC_CONTAINER_EXIT_RULE))
+        .getCurrentContainerThreshold();
+  }
+
+  @VisibleForTesting
+  public RatisContainerSafeModeRule getRatisContainerSafeModeRule() {
+    return (RatisContainerSafeModeRule) 
exitRules.get(RATIS_CONTAINER_EXIT_RULE);
   }
 
   @VisibleForTesting
-  public ContainerSafeModeRule getContainerSafeModeRule() {
-    return (ContainerSafeModeRule) exitRules.get(CONT_EXIT_RULE);
+  public ECContainerSafeModeRule getECContainerSafeModeRule() {
+    return (ECContainerSafeModeRule) exitRules.get(EC_CONTAINER_EXIT_RULE);
   }
 
   @VisibleForTesting
diff --git 
a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/safemode/SafeModeRuleFactory.java
 
b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/safemode/SafeModeRuleFactory.java
index 65be38ae6e..96c22d06e7 100644
--- 
a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/safemode/SafeModeRuleFactory.java
+++ 
b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/safemode/SafeModeRuleFactory.java
@@ -68,12 +68,15 @@ private SafeModeRuleFactory(final ConfigurationSource 
config,
 
   private void loadRules() {
     // TODO: Use annotation to load the rules. (HDDS-11730)
-    SafeModeExitRule<?> containerRule = new ContainerSafeModeRule(eventQueue, 
+    SafeModeExitRule<?> ratisContainerRule = new 
RatisContainerSafeModeRule(eventQueue,
+        config, containerManager, safeModeManager);
+    SafeModeExitRule<?> ecContainerRule = new 
ECContainerSafeModeRule(eventQueue,
         config, containerManager, safeModeManager);
     SafeModeExitRule<?> datanodeRule = new DataNodeSafeModeRule(eventQueue, 
         config, nodeManager, safeModeManager);
 
-    safeModeRules.add(containerRule);
+    safeModeRules.add(ratisContainerRule);
+    safeModeRules.add(ecContainerRule);
     safeModeRules.add(datanodeRule);
 
     preCheckRules.add(datanodeRule);
diff --git 
a/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/safemode/TestSCMSafeModeManager.java
 
b/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/safemode/TestSCMSafeModeManager.java
index f693df81dc..7a43792eb5 100644
--- 
a/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/safemode/TestSCMSafeModeManager.java
+++ 
b/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/safemode/TestSCMSafeModeManager.java
@@ -44,6 +44,7 @@
 import org.apache.hadoop.hdds.protocol.DatanodeDetails;
 import org.apache.hadoop.hdds.protocol.proto.HddsProtos;
 import org.apache.hadoop.hdds.protocol.proto.HddsProtos.ReplicationFactor;
+import org.apache.hadoop.hdds.protocol.proto.HddsProtos.ReplicationType;
 import 
org.apache.hadoop.hdds.protocol.proto.StorageContainerDatanodeProtocolProtos;
 import org.apache.hadoop.hdds.scm.HddsTestUtils;
 import org.apache.hadoop.hdds.scm.container.ContainerInfo;
@@ -130,7 +131,7 @@ private void testSafeMode(int numContainers) throws 
Exception {
       container.setNumberOfKeys(10);
     }
     ContainerManager containerManager = mock(ContainerManager.class);
-    when(containerManager.getContainers()).thenReturn(containers);
+    
when(containerManager.getContainers(ReplicationType.RATIS)).thenReturn(containers);
     scmSafeModeManager = new SCMSafeModeManager(
         config, containerManager, null, null, queue,
         serviceManager, scmContext);
@@ -169,7 +170,7 @@ public void testSafeModeExitRule() throws Exception {
       container.setNumberOfKeys(10);
     }
     ContainerManager containerManager = mock(ContainerManager.class);
-    when(containerManager.getContainers()).thenReturn(containers);
+    
when(containerManager.getContainers(ReplicationType.RATIS)).thenReturn(containers);
     scmSafeModeManager = new SCMSafeModeManager(
         config, containerManager, null, null, queue,
         serviceManager, scmContext);
@@ -235,7 +236,7 @@ public void 
testHealthyPipelinePercentWithIncorrectValue(double healthyPercent,
         serviceManager,
         Clock.system(ZoneOffset.UTC));
     ContainerManager containerManager = mock(ContainerManager.class);
-    when(containerManager.getContainers()).thenReturn(containers);
+    
when(containerManager.getContainers(ReplicationType.RATIS)).thenReturn(containers);
     IllegalArgumentException exception = 
assertThrows(IllegalArgumentException.class,
         () -> new SCMSafeModeManager(conf, containerManager,
             pipelineManager, mockNodeManager, queue, serviceManager, 
scmContext));
@@ -301,7 +302,7 @@ public void 
testSafeModeExitRuleWithPipelineAvailabilityCheck(
     }
 
     ContainerManager containerManager = mock(ContainerManager.class);
-    when(containerManager.getContainers()).thenReturn(containers);
+    
when(containerManager.getContainers(ReplicationType.RATIS)).thenReturn(containers);
 
     scmSafeModeManager = new SCMSafeModeManager(
         conf, containerManager, pipelineManager, mockNodeManager, queue,
@@ -438,8 +439,8 @@ public void testDisableSafeMode() {
     conf.setBoolean(HddsConfigKeys.HDDS_SCM_SAFEMODE_ENABLED, false);
     PipelineManager pipelineManager = mock(PipelineManager.class);
     ContainerManager containerManager = mock(ContainerManager.class);
+    
when(containerManager.getContainers(ReplicationType.RATIS)).thenReturn(containers);
     NodeManager nodeManager = mock(SCMNodeManager.class);
-    when(containerManager.getContainers()).thenReturn(containers);
     scmSafeModeManager = new SCMSafeModeManager(
         conf, containerManager, pipelineManager, nodeManager, queue,
         serviceManager, scmContext);
@@ -480,7 +481,7 @@ public void testContainerSafeModeRule() throws Exception {
     }
 
     ContainerManager containerManager = mock(ContainerManager.class);
-    when(containerManager.getContainers()).thenReturn(containers);
+    
when(containerManager.getContainers(ReplicationType.RATIS)).thenReturn(containers);
 
     scmSafeModeManager = new SCMSafeModeManager(
         config, containerManager, null, null, queue, serviceManager, 
scmContext);
@@ -572,16 +573,20 @@ public void testContainerSafeModeRuleEC(int data, int 
parity) throws Exception {
     // the threshold will reach 100%.
     testECContainerThreshold(containers.subList(10, 20), 1.0, data);
 
-    ContainerSafeModeRule containerSafeModeRule =
-        scmSafeModeManager.getContainerSafeModeRule();
-    assertTrue(containerSafeModeRule.validate());
+    ECContainerSafeModeRule ecContainerSafeModeRule =
+        scmSafeModeManager.getECContainerSafeModeRule();
+    assertTrue(ecContainerSafeModeRule.validate());
+
+    RatisContainerSafeModeRule ratisContainerSafeModeRule =
+        scmSafeModeManager.getRatisContainerSafeModeRule();
+    assertTrue(ratisContainerSafeModeRule.validate());
   }
 
   private void testSafeModeDataNodes(int numOfDns) throws Exception {
     OzoneConfiguration conf = new OzoneConfiguration(config);
     conf.setInt(HddsConfigKeys.HDDS_SCM_SAFEMODE_MIN_DATANODE, numOfDns);
     ContainerManager containerManager = mock(ContainerManager.class);
-    when(containerManager.getContainers()).thenReturn(containers);
+    
when(containerManager.getContainers(ReplicationType.RATIS)).thenReturn(containers);
     scmSafeModeManager = new SCMSafeModeManager(
         conf, containerManager, null, null, queue,
         serviceManager, scmContext);
@@ -689,7 +694,7 @@ public void testSafeModePipelineExitRule() throws Exception 
{
     pipeline = pipelineManager.getPipeline(pipeline.getId());
     MockRatisPipelineProvider.markPipelineHealthy(pipeline);
     ContainerManager containerManager = mock(ContainerManager.class);
-    when(containerManager.getContainers()).thenReturn(containers);
+    
when(containerManager.getContainers(ReplicationType.RATIS)).thenReturn(containers);
 
     scmSafeModeManager = new SCMSafeModeManager(
           config, containerManager, pipelineManager, nodeManager, queue,
@@ -738,7 +743,7 @@ public void testPipelinesNotCreatedUntilPreCheckPasses() 
throws Exception {
         mockRatisProvider);
 
     ContainerManager containerManager = mock(ContainerManager.class);
-    when(containerManager.getContainers()).thenReturn(containers);
+    
when(containerManager.getContainers(ReplicationType.RATIS)).thenReturn(containers);
 
     scmSafeModeManager = new SCMSafeModeManager(
         config, containerManager, pipelineManager, nodeManager, queue,
diff --git 
a/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/safemode/TestSafeModeRuleFactory.java
 
b/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/safemode/TestSafeModeRuleFactory.java
index 1868d9e3cb..35917039d5 100644
--- 
a/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/safemode/TestSafeModeRuleFactory.java
+++ 
b/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/safemode/TestSafeModeRuleFactory.java
@@ -55,7 +55,7 @@ public void testLoadedSafeModeRules() {
     // as the rules are hardcoded in SafeModeRuleFactory.
 
     // This will be fixed once we load rules using annotation.
-    assertEquals(4, factory.getSafeModeRules().size(),
+    assertEquals(5, factory.getSafeModeRules().size(),
         "The total safemode rules count doesn't match");
 
   }


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]


Reply via email to