This is an automated email from the ASF dual-hosted git repository.
siddhant pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/ozone.git
The following commit(s) were added to refs/heads/master by this push:
new a99ab2706e HDDS-11243. SCM SafeModeRule Support EC. (#7008)
a99ab2706e is described below
commit a99ab2706e8662d9749904b5fd082efe3774741f
Author: slfan1989 <[email protected]>
AuthorDate: Tue Nov 26 21:31:56 2024 +0800
HDDS-11243. SCM SafeModeRule Support EC. (#7008)
---
.../hadoop/hdds/client/ECReplicationConfig.java | 5 +
.../hadoop/hdds/client/RatisReplicationConfig.java | 5 +
.../hadoop/hdds/client/ReplicationConfig.java | 2 +
.../hdds/client/StandaloneReplicationConfig.java | 5 +
.../hdds/scm/container/ContainerReportHandler.java | 7 +
.../apache/hadoop/hdds/scm/events/SCMEvents.java | 7 +
.../hdds/scm/safemode/ContainerSafeModeRule.java | 281 ++++++++++++++++-----
.../hdds/scm/safemode/SCMSafeModeManager.java | 11 +
.../hadoop/hdds/scm/safemode/SafeModeMetrics.java | 16 ++
.../scm/server/SCMDatanodeHeartbeatDispatcher.java | 12 +
.../hdds/scm/server/SCMDatanodeProtocolServer.java | 2 +-
.../main/resources/webapps/scm/scm-overview.html | 2 +-
.../src/main/resources/webapps/scm/scm.js | 10 +-
.../org/apache/hadoop/hdds/scm/HddsTestUtils.java | 26 ++
.../hdds/scm/safemode/TestSCMSafeModeManager.java | 149 ++++++++++-
15 files changed, 461 insertions(+), 79 deletions(-)
diff --git a/hadoop-hdds/common/src/main/java/org/apache/hadoop/hdds/client/ECReplicationConfig.java b/hadoop-hdds/common/src/main/java/org/apache/hadoop/hdds/client/ECReplicationConfig.java
index a6dbd933ff..9709029634 100644
--- a/hadoop-hdds/common/src/main/java/org/apache/hadoop/hdds/client/ECReplicationConfig.java
+++ b/hadoop-hdds/common/src/main/java/org/apache/hadoop/hdds/client/ECReplicationConfig.java
@@ -218,6 +218,11 @@ public class ECReplicationConfig implements ReplicationConfig {
+ "/" + data + "-" + parity + "-" + chunkKB();
}
+ @Override
+ public int getMinimumNodes() {
+ return data;
+ }
+
private String chunkKB() {
return ecChunkSize / 1024 + "k";
}
diff --git a/hadoop-hdds/common/src/main/java/org/apache/hadoop/hdds/client/RatisReplicationConfig.java b/hadoop-hdds/common/src/main/java/org/apache/hadoop/hdds/client/RatisReplicationConfig.java
index 36d4d90e1a..9c42e3d59b 100644
--- a/hadoop-hdds/common/src/main/java/org/apache/hadoop/hdds/client/RatisReplicationConfig.java
+++ b/hadoop-hdds/common/src/main/java/org/apache/hadoop/hdds/client/RatisReplicationConfig.java
@@ -126,4 +126,9 @@ public final class RatisReplicationConfig
public String configFormat() {
return toString();
}
+
+ @Override
+ public int getMinimumNodes() {
+ return 1;
+ }
}
diff --git a/hadoop-hdds/common/src/main/java/org/apache/hadoop/hdds/client/ReplicationConfig.java b/hadoop-hdds/common/src/main/java/org/apache/hadoop/hdds/client/ReplicationConfig.java
index 7542409679..d82cd08c08 100644
--- a/hadoop-hdds/common/src/main/java/org/apache/hadoop/hdds/client/ReplicationConfig.java
+++ b/hadoop-hdds/common/src/main/java/org/apache/hadoop/hdds/client/ReplicationConfig.java
@@ -234,4 +234,6 @@ public interface ReplicationConfig {
String configFormat();
+ /** Minimum number of nodes; below this, data loss happens. */
+ int getMinimumNodes();
}
diff --git a/hadoop-hdds/common/src/main/java/org/apache/hadoop/hdds/client/StandaloneReplicationConfig.java b/hadoop-hdds/common/src/main/java/org/apache/hadoop/hdds/client/StandaloneReplicationConfig.java
index 9ca2dfb538..0b82ab8c87 100644
--- a/hadoop-hdds/common/src/main/java/org/apache/hadoop/hdds/client/StandaloneReplicationConfig.java
+++ b/hadoop-hdds/common/src/main/java/org/apache/hadoop/hdds/client/StandaloneReplicationConfig.java
@@ -128,4 +128,9 @@ public final class StandaloneReplicationConfig implements
public String configFormat() {
return toString();
}
+
+ @Override
+ public int getMinimumNodes() {
+ return replicationFactor.getNumber();
+ }
}
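
Taken together, the three implementations above give the new getMinimumNodes() a type-specific meaning: for EC it is the data-block count (fewer surviving nodes than that means data loss), for Ratis it is 1 (any surviving replica holds the full data), and for standalone it is the replication factor. A minimal sketch of the resulting behaviour, reusing only constructors and factory methods that appear elsewhere in this diff; the expected return values are noted in comments:

import org.apache.hadoop.hdds.client.ECReplicationConfig;
import org.apache.hadoop.hdds.client.RatisReplicationConfig;
import org.apache.hadoop.hdds.client.ReplicationConfig;
import org.apache.hadoop.hdds.protocol.proto.HddsProtos.ReplicationFactor;

public final class MinimumNodesDemo {
  public static void main(String[] args) {
    // EC(data=3, parity=2): losing more nodes than the parity count loses data,
    // so the minimum is the number of data blocks.
    ReplicationConfig ec = new ECReplicationConfig(3, 2);
    System.out.println(ec.getMinimumNodes());    // expected: 3

    // Ratis THREE: a single surviving replica still holds the complete data.
    ReplicationConfig ratis = RatisReplicationConfig.getInstance(ReplicationFactor.THREE);
    System.out.println(ratis.getMinimumNodes()); // expected: 1
  }
}

ContainerSafeModeRule below relies on exactly this value when deciding how many distinct datanodes must report an EC container.
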
diff --git a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/ContainerReportHandler.java b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/ContainerReportHandler.java
index daadcd824e..36a51c4e3c 100644
--- a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/ContainerReportHandler.java
+++ b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/ContainerReportHandler.java
@@ -26,11 +26,13 @@ import org.apache.hadoop.hdds.protocol.proto
.StorageContainerDatanodeProtocolProtos.ContainerReportsProto;
import org.apache.hadoop.hdds.scm.ScmConfig;
import org.apache.hadoop.hdds.scm.container.report.ContainerReportValidator;
+import org.apache.hadoop.hdds.scm.events.SCMEvents;
import org.apache.hadoop.hdds.scm.ha.SCMContext;
import org.apache.hadoop.hdds.scm.node.NodeManager;
import org.apache.hadoop.hdds.scm.node.states.NodeNotFoundException;
import org.apache.hadoop.hdds.scm.server.SCMDatanodeHeartbeatDispatcher
.ContainerReportFromDatanode;
+import org.apache.hadoop.hdds.scm.server.SCMDatanodeProtocolServer;
import org.apache.hadoop.hdds.server.events.EventHandler;
import org.apache.hadoop.hdds.server.events.EventPublisher;
import org.apache.hadoop.ozone.common.statemachine.InvalidStateTransitionException;
@@ -199,6 +201,11 @@ public class ContainerReportHandler extends AbstractContainerReportHandler
// list
processMissingReplicas(datanodeDetails, expectedContainersInDatanode);
containerManager.notifyContainerReportProcessing(true, true);
+ if (reportFromDatanode.isRegister()) {
+ publisher.fireEvent(SCMEvents.CONTAINER_REGISTRATION_REPORT,
+ new SCMDatanodeProtocolServer.NodeRegistrationContainerReport(datanodeDetails,
+ reportFromDatanode.getReport()));
+ }
}
} catch (NodeNotFoundException ex) {
containerManager.notifyContainerReportProcessing(true, false);
diff --git a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/events/SCMEvents.java b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/events/SCMEvents.java
index 4fcf130117..6c155a6ec5 100644
--- a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/events/SCMEvents.java
+++ b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/events/SCMEvents.java
@@ -64,6 +64,13 @@ public final class SCMEvents {
NodeRegistrationContainerReport.class,
"Node_Registration_Container_Report");
+ /**
+ * Event generated on DataNode Registration Container Report.
+ */
+ public static final TypedEvent<NodeRegistrationContainerReport>
+ CONTAINER_REGISTRATION_REPORT = new TypedEvent<>(
+ NodeRegistrationContainerReport.class, "Container_Registration_Report");
+
/**
* ContainerReports are sent out by Datanodes. This report is received by
* SCMDatanodeHeartbeatDispatcher and Container_Report Event is generated.
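
The new TypedEvent is what lets ContainerSafeModeRule (next file) switch its subscription from NODE_REGISTRATION_CONT_REPORT to the per-datanode registration report that ContainerReportHandler above now fires. An illustrative sketch of that publish/subscribe wiring; it assumes the EventQueue#addHandler / EventHandler#onMessage shape of the hdds events framework, which is outside this diff:

import org.apache.hadoop.hdds.scm.events.SCMEvents;
import org.apache.hadoop.hdds.scm.server.SCMDatanodeProtocolServer.NodeRegistrationContainerReport;
import org.apache.hadoop.hdds.server.events.EventHandler;
import org.apache.hadoop.hdds.server.events.EventQueue;

public final class RegistrationReportListenerSketch {
  public static void main(String[] args) {
    EventQueue queue = new EventQueue();

    // A handler for the new event; ContainerSafeModeRule subscribes the same
    // way through SafeModeExitRule#getEventType().
    EventHandler<NodeRegistrationContainerReport> handler =
        (report, publisher) -> System.out.println(
            "Registration container report from " + report.getDatanodeDetails());

    queue.addHandler(SCMEvents.CONTAINER_REGISTRATION_REPORT, handler);
    // ContainerReportHandler fires this event only for container reports whose
    // isRegister flag is true; the tests later in this diff fire it directly
    // with HddsTestUtils.createNodeRegistrationContainerReport(...).
  }
}
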
diff --git a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/safemode/ContainerSafeModeRule.java b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/safemode/ContainerSafeModeRule.java
index ae645858a3..accd805602 100644
--- a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/safemode/ContainerSafeModeRule.java
+++ b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/safemode/ContainerSafeModeRule.java
@@ -19,16 +19,25 @@ package org.apache.hadoop.hdds.scm.safemode;
import java.util.List;
import java.util.Map;
-import java.util.Optional;
+import java.util.UUID;
+import java.util.Set;
+import java.util.HashSet;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.atomic.AtomicLong;
import java.util.stream.Collectors;
+import com.google.common.collect.Sets;
import org.apache.hadoop.hdds.HddsConfigKeys;
+import org.apache.hadoop.hdds.client.ReplicationConfig;
import org.apache.hadoop.hdds.conf.ConfigurationSource;
+import org.apache.hadoop.hdds.protocol.DatanodeDetails;
import org.apache.hadoop.hdds.protocol.proto.HddsProtos;
+import org.apache.hadoop.hdds.protocol.proto.HddsProtos.LifeCycleState;
+import org.apache.hadoop.hdds.protocol.proto.StorageContainerDatanodeProtocolProtos;
+import org.apache.hadoop.hdds.scm.container.ContainerID;
import org.apache.hadoop.hdds.scm.container.ContainerInfo;
import org.apache.hadoop.hdds.scm.container.ContainerManager;
+import org.apache.hadoop.hdds.scm.container.ContainerNotFoundException;
import org.apache.hadoop.hdds.scm.events.SCMEvents;
import org.apache.hadoop.hdds.scm.server.SCMDatanodeProtocolServer.NodeRegistrationContainerReport;
import org.apache.hadoop.hdds.server.events.EventQueue;
@@ -50,10 +59,13 @@ public class ContainerSafeModeRule extends
// Required cutoff % for containers with at least 1 reported replica.
private double safeModeCutoff;
// Containers read from scm db (excluding containers in ALLOCATED state).
- private Map<Long, ContainerInfo> containerMap;
- private double maxContainer;
-
- private AtomicLong containerWithMinReplicas = new AtomicLong(0);
+ private Set<Long> ratisContainers;
+ private Set<Long> ecContainers;
+ private Map<Long, Set<UUID>> ecContainerDNsMap;
+ private double ratisMaxContainer;
+ private double ecMaxContainer;
+ private AtomicLong ratisContainerWithMinReplicas = new AtomicLong(0);
+ private AtomicLong ecContainerWithMinReplicas = new AtomicLong(0);
private final ContainerManager containerManager;
public ContainerSafeModeRule(String ruleName, EventQueue eventQueue,
@@ -71,127 +83,268 @@ public class ContainerSafeModeRule extends
HddsConfigKeys.HDDS_SCM_SAFEMODE_THRESHOLD_PCT +
" value should be >= 0.0 and <= 1.0");
- containerMap = new ConcurrentHashMap<>();
- containers.forEach(container -> {
- // There can be containers in OPEN/CLOSING state which were never
- // created by the client. We are not considering these containers for
- // now. These containers can be handled by tracking pipelines.
-
- Optional.ofNullable(container.getState())
- .filter(state -> (state == HddsProtos.LifeCycleState.QUASI_CLOSED ||
- state == HddsProtos.LifeCycleState.CLOSED)
- && container.getNumberOfKeys() > 0)
- .ifPresent(s -> containerMap.put(container.getContainerID(),
- container));
- });
- maxContainer = containerMap.size();
- long cutOff = (long) Math.ceil(maxContainer * safeModeCutoff);
- getSafeModeMetrics().setNumContainerWithOneReplicaReportedThreshold(cutOff);
+ ratisContainers = new HashSet<>();
+ ecContainers = new HashSet<>();
+ ecContainerDNsMap = new ConcurrentHashMap<>();
- LOG.info("containers with one replica threshold count {}", cutOff);
+ initializeRule(containers);
}
@Override
protected TypedEvent<NodeRegistrationContainerReport> getEventType() {
- return SCMEvents.NODE_REGISTRATION_CONT_REPORT;
+ return SCMEvents.CONTAINER_REGISTRATION_REPORT;
}
-
@Override
protected synchronized boolean validate() {
- return getCurrentContainerThreshold() >= safeModeCutoff;
+ return (getCurrentContainerThreshold() >= safeModeCutoff) &&
+ (getCurrentECContainerThreshold() >= safeModeCutoff);
}
@VisibleForTesting
public synchronized double getCurrentContainerThreshold() {
- if (maxContainer == 0) {
+ if (ratisMaxContainer == 0) {
+ return 1;
+ }
+ return (ratisContainerWithMinReplicas.doubleValue() / ratisMaxContainer);
+ }
+
+ @VisibleForTesting
+ public synchronized double getCurrentECContainerThreshold() {
+ if (ecMaxContainer == 0) {
+ return 1;
+ }
+ return (ecContainerWithMinReplicas.doubleValue() / ecMaxContainer);
+ }
+
+ private synchronized double getEcMaxContainer() {
+ if (ecMaxContainer == 0) {
+ return 1;
+ }
+ return ecMaxContainer;
+ }
+
+ private synchronized double getRatisMaxContainer() {
+ if (ratisMaxContainer == 0) {
return 1;
}
- return (containerWithMinReplicas.doubleValue() / maxContainer);
+ return ratisMaxContainer;
}
@Override
protected synchronized void process(
NodeRegistrationContainerReport reportsProto) {
+ DatanodeDetails datanodeDetails = reportsProto.getDatanodeDetails();
+ UUID datanodeUUID = datanodeDetails.getUuid();
+ StorageContainerDatanodeProtocolProtos.ContainerReportsProto report = reportsProto.getReport();
- reportsProto.getReport().getReportsList().forEach(c -> {
- if (containerMap.containsKey(c.getContainerID())) {
- if (containerMap.remove(c.getContainerID()) != null) {
- containerWithMinReplicas.getAndAdd(1);
- getSafeModeMetrics()
- .incCurrentContainersWithOneReplicaReportedCount();
- }
+ report.getReportsList().forEach(c -> {
+ long containerID = c.getContainerID();
+
+
+ // If it is a Ratis container.
+ if (ratisContainers.contains(containerID)) {
+ recordReportedContainer(containerID, Boolean.FALSE);
+ ratisContainers.remove(containerID);
+ }
+
+ // If it is an EC container.
+ if (ecContainers.contains(containerID)) {
+ putInContainerDNsMap(containerID, ecContainerDNsMap, datanodeUUID);
+ recordReportedContainer(containerID, Boolean.TRUE);
}
});
if (scmInSafeMode()) {
SCMSafeModeManager.getLogger().info(
- "SCM in safe mode. {} % containers have at least one"
- + " reported replica.",
- (containerWithMinReplicas.doubleValue() / maxContainer) * 100);
+ "SCM in safe mode. {} % containers [Ratis] have at least one"
+ + " reported replica, {} % containers [EC] have at N reported
replica.",
+ ((ratisContainerWithMinReplicas.doubleValue() /
getRatisMaxContainer()) * 100),
+ ((ecContainerWithMinReplicas.doubleValue() / getEcMaxContainer()) *
100)
+ );
}
}
+ /**
+ * Record the reported Container.
+ *
+ * We will differentiate and count according to the type of Container.
+ *
+ * @param containerID containerID
+ * @param isEcContainer true means an EC container, false means a non-EC container.
+ */
+ private void recordReportedContainer(long containerID, boolean isEcContainer) {
+
+ int uuids = 1;
+ if (isEcContainer && ecContainerDNsMap.containsKey(containerID)) {
+ uuids = ecContainerDNsMap.get(containerID).size();
+ }
+
+ int minReplica = getMinReplica(containerID);
+ if (uuids >= minReplica) {
+ if (isEcContainer) {
+ getSafeModeMetrics()
+ .incCurrentContainersWithECDataReplicaReportedCount();
+ ecContainerWithMinReplicas.getAndAdd(1);
+ } else {
+ ratisContainerWithMinReplicas.getAndAdd(1);
+ getSafeModeMetrics()
+ .incCurrentContainersWithOneReplicaReportedCount();
+ }
+ }
+ }
+
+ /**
+ * Get the minimum replica.
+ *
+ * If it is a Ratis Container, the minimum replica count is 1.
+ * If it is an EC Container, the minimum replica count is the number of data blocks in the replicationConfig.
+ *
+ * @param pContainerID containerID
+ * @return MinReplica.
+ */
+ private int getMinReplica(long pContainerID) {
+
+ try {
+ ContainerID containerID = ContainerID.valueOf(pContainerID);
+ ContainerInfo container = containerManager.getContainer(containerID);
+ ReplicationConfig replicationConfig = container.getReplicationConfig();
+ return replicationConfig.getMinimumNodes();
+ } catch (ContainerNotFoundException e) {
+ LOG.error("containerId = {} not found.", pContainerID, e);
+ } catch (Exception e) {
+ LOG.error("containerId = {} not found.", pContainerID, e);
+ }
+
+ return 1;
+ }
+
+ private void putInContainerDNsMap(long containerID, Map<Long, Set<UUID>> containerDNsMap,
+ UUID datanodeUUID) {
+ containerDNsMap.computeIfAbsent(containerID, key -> Sets.newHashSet());
+ containerDNsMap.get(containerID).add(datanodeUUID);
+ }
+
@Override
protected synchronized void cleanup() {
- containerMap.clear();
+ ratisContainers.clear();
+ ecContainers.clear();
+ ecContainerDNsMap.clear();
}
@Override
public String getStatusText() {
- List<Long> sampleContainers = containerMap.keySet()
- .stream()
- .limit(SAMPLE_CONTAINER_DISPLAY_LIMIT)
- .collect(Collectors.toList());
- String status = String.format("%% of containers with at least one reported"
- + " replica (=%1.2f) >= safeModeCutoff (=%1.2f)",
+ // ratis container
+ String status = String.format(
+ "%1.2f%% of [Ratis] Containers(%s / %s) with at least one reported
replica (=%1.2f) >= " +
+ "safeModeCutoff (=%1.2f);",
+ (ratisContainerWithMinReplicas.doubleValue() / getRatisMaxContainer())
* 100,
+ ratisContainerWithMinReplicas, (long) getRatisMaxContainer(),
getCurrentContainerThreshold(), this.safeModeCutoff);
- if (!sampleContainers.isEmpty()) {
+ Set<Long> sampleRatisContainers = ratisContainers.stream().
+ limit(SAMPLE_CONTAINER_DISPLAY_LIMIT).
+ collect(Collectors.toSet());
+
+ if (!sampleRatisContainers.isEmpty()) {
String sampleContainerText =
- "Sample containers not satisfying the criteria : " +
sampleContainers;
+ "Sample Ratis Containers not satisfying the criteria : " +
sampleRatisContainers + ";";
status = status.concat("\n").concat(sampleContainerText);
}
+ // ec container
+ String ecStatus = String.format(
+ "%1.2f%% of [EC] Containers(%s / %s) with at least N reported replica
(=%1.2f) >= " +
+ "safeModeCutoff (=%1.2f);",
+ (ecContainerWithMinReplicas.doubleValue() / getEcMaxContainer()) * 100,
+ ecContainerWithMinReplicas, (long) getEcMaxContainer(),
+ getCurrentECContainerThreshold(), this.safeModeCutoff);
+ status = status.concat("\n").concat(ecStatus);
+
+ Set<Long> sampleEcContainers = ecContainerDNsMap.entrySet().stream().
+ filter(entry -> {
+ Long containerId = entry.getKey();
+ int minReplica = getMinReplica(containerId);
+ Set<UUID> allReplicas = entry.getValue();
+ if (allReplicas.size() >= minReplica) {
+ return false;
+ }
+ return true;
+ }).
+ map(Map.Entry::getKey).
+ limit(SAMPLE_CONTAINER_DISPLAY_LIMIT).
+ collect(Collectors.toSet());
+
+ if (!sampleEcContainers.isEmpty()) {
+ String sampleECContainerText =
+ "Sample EC Containers not satisfying the criteria : " +
sampleEcContainers + ";";
+ status = status.concat("\n").concat(sampleECContainerText);
+ }
+
return status;
}
@Override
public synchronized void refresh(boolean forceRefresh) {
+ List<ContainerInfo> containers = containerManager.getContainers();
if (forceRefresh) {
- reInitializeRule();
+ initializeRule(containers);
} else {
if (!validate()) {
- reInitializeRule();
+ initializeRule(containers);
}
}
}
- private void reInitializeRule() {
- containerMap.clear();
- containerManager.getContainers().forEach(container -> {
+ private boolean checkContainerState(LifeCycleState state) {
+ if (state == LifeCycleState.QUASI_CLOSED || state == LifeCycleState.CLOSED) {
+ return true;
+ }
+ return false;
+ }
+
+ private void initializeRule(List<ContainerInfo> containers) {
+
+ // Clean up the related data in the map.
+ ratisContainers.clear();
+ ecContainers.clear();
+
+ // Iterate through the container list to
+ // get the minimum replica count for each container.
+ containers.forEach(container -> {
// There can be containers in OPEN/CLOSING state which were never
// created by the client. We are not considering these containers for
// now. These containers can be handled by tracking pipelines.
- Optional.ofNullable(container.getState())
- .filter(state -> (state == HddsProtos.LifeCycleState.QUASI_CLOSED ||
- state == HddsProtos.LifeCycleState.CLOSED)
- && container.getNumberOfKeys() > 0)
- .ifPresent(s -> containerMap.put(container.getContainerID(),
- container));
+ LifeCycleState containerState = container.getState();
+ HddsProtos.ReplicationType replicationType = container.getReplicationType();
+
+ if (checkContainerState(containerState) && container.getNumberOfKeys() > 0) {
+ // If it's of type Ratis
+ if (replicationType.equals(HddsProtos.ReplicationType.RATIS)) {
+ ratisContainers.add(container.getContainerID());
+ }
+
+ // If it's of type EC
+ if (replicationType.equals(HddsProtos.ReplicationType.EC)) {
+ ecContainers.add(container.getContainerID());
+ }
+ }
});
- maxContainer = containerMap.size();
- long cutOff = (long) Math.ceil(maxContainer * safeModeCutoff);
+ ratisMaxContainer = ratisContainers.size();
+ ecMaxContainer = ecContainers.size();
- LOG.info("Refreshed one replica container threshold {}, " +
- "currentThreshold {}", cutOff, containerWithMinReplicas.get());
- getSafeModeMetrics()
- .setNumContainerWithOneReplicaReportedThreshold(cutOff);
- }
+ long ratisCutOff = (long) Math.ceil(ratisMaxContainer * safeModeCutoff);
+ long ecCutOff = (long) Math.ceil(ecMaxContainer * safeModeCutoff);
+
+ getSafeModeMetrics().setNumContainerWithOneReplicaReportedThreshold(ratisCutOff);
+ getSafeModeMetrics().setNumContainerWithECDataReplicaReportedThreshold(ecCutOff);
+ LOG.info("Refreshed Containers with one replica threshold count {}, " +
+ "with ec n replica threshold count {}.", ratisCutOff, ecCutOff);
+ }
}
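
In short, the rewritten rule keeps two tallies: a Ratis container is counted as soon as any datanode reports it (one replica is enough to recover the data), while an EC container is only counted once getMinReplica() distinct datanodes, i.e. at least the data-block count, have reported it, with ecContainerDNsMap remembering which datanode UUIDs have been seen per container. A self-contained sketch of that counting logic using plain JDK collections (a simplified model, not the Ozone classes themselves; it counts each container exactly once, when the minimum is first reached):

import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;
import java.util.UUID;

public final class EcSafeModeCountingSketch {

  // containerID -> minimum distinct datanodes required (the EC data-block count).
  private final Map<Long, Integer> minReplicaPerContainer = new HashMap<>();
  // containerID -> datanodes that have reported it so far.
  private final Map<Long, Set<UUID>> reportedDns = new HashMap<>();
  private long confirmedEcContainers = 0;

  EcSafeModeCountingSketch(Map<Long, Integer> minReplicaPerContainer) {
    this.minReplicaPerContainer.putAll(minReplicaPerContainer);
  }

  /** Called once per (datanode, container) pair found in a registration report. */
  void onReport(long containerId, UUID datanodeId) {
    Integer minReplica = minReplicaPerContainer.get(containerId);
    if (minReplica == null) {
      return; // not an EC container tracked by the rule
    }
    Set<UUID> dns = reportedDns.computeIfAbsent(containerId, k -> new HashSet<>());
    // Count the container exactly once, when it first reaches the minimum.
    if (dns.add(datanodeId) && dns.size() == minReplica) {
      confirmedEcContainers++;
    }
  }

  /** 1.0 when nothing is tracked, otherwise confirmed / total, as in the rule. */
  double threshold() {
    return minReplicaPerContainer.isEmpty()
        ? 1.0
        : (double) confirmedEcContainers / minReplicaPerContainer.size();
  }
}

threshold() mirrors getCurrentECContainerThreshold(): it returns 1.0 when no EC containers are tracked and confirmed/total otherwise, which is the value validate() compares against safeModeCutoff.
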
diff --git a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/safemode/SCMSafeModeManager.java b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/safemode/SCMSafeModeManager.java
index a5ecdb2342..39530de16b 100644
--- a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/safemode/SCMSafeModeManager.java
+++ b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/safemode/SCMSafeModeManager.java
@@ -341,6 +341,17 @@ public class SCMSafeModeManager implements SafeModeManager {
.getCurrentContainerThreshold();
}
+ @VisibleForTesting
+ public double getCurrentECContainerThreshold() {
+ return ((ContainerSafeModeRule) exitRules.get(CONT_EXIT_RULE))
+ .getCurrentECContainerThreshold();
+ }
+
+ @VisibleForTesting
+ public ContainerSafeModeRule getContainerSafeModeRule() {
+ return (ContainerSafeModeRule) exitRules.get(CONT_EXIT_RULE);
+ }
+
@VisibleForTesting
public HealthyPipelineSafeModeRule getHealthyPipelineSafeModeRule() {
return (HealthyPipelineSafeModeRule)
diff --git a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/safemode/SafeModeMetrics.java b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/safemode/SafeModeMetrics.java
index 02bc10ba6e..44c77ac3de 100644
--- a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/safemode/SafeModeMetrics.java
+++ b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/safemode/SafeModeMetrics.java
@@ -36,8 +36,12 @@ public class SafeModeMetrics {
// These all values will be set to some values when safemode is enabled.
private @Metric MutableGaugeLong
numContainerWithOneReplicaReportedThreshold;
+ private @Metric MutableGaugeLong
+ numContainerWithECDataReplicaReportedThreshold;
private @Metric MutableCounterLong
currentContainersWithOneReplicaReportedCount;
+ private @Metric MutableCounterLong
+ currentContainersWithECDataReplicaReportedCount;
// When hdds.scm.safemode.pipeline-availability.check is set then only
// below metrics will have some values, otherwise they will be zero.
@@ -75,10 +79,18 @@ public class SafeModeMetrics {
this.numContainerWithOneReplicaReportedThreshold.set(val);
}
+ public void setNumContainerWithECDataReplicaReportedThreshold(long val) {
+ this.numContainerWithECDataReplicaReportedThreshold.set(val);
+ }
+
public void incCurrentContainersWithOneReplicaReportedCount() {
this.currentContainersWithOneReplicaReportedCount.incr();
}
+ public void incCurrentContainersWithECDataReplicaReportedCount() {
+ this.currentContainersWithECDataReplicaReportedCount.incr();
+ }
+
MutableGaugeLong getNumHealthyPipelinesThreshold() {
return numHealthyPipelinesThreshold;
}
@@ -100,6 +112,10 @@ public class SafeModeMetrics {
return numContainerWithOneReplicaReportedThreshold;
}
+ MutableGaugeLong getNumContainerWithECDataReplicaReportedThreshold() {
+ return numContainerWithECDataReplicaReportedThreshold;
+ }
+
MutableCounterLong getCurrentContainersWithOneReplicaReportedCount() {
return currentContainersWithOneReplicaReportedCount;
}
diff --git a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/server/SCMDatanodeHeartbeatDispatcher.java b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/server/SCMDatanodeHeartbeatDispatcher.java
index 6f5429a853..5a4dc505d8 100644
--- a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/server/SCMDatanodeHeartbeatDispatcher.java
+++ b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/server/SCMDatanodeHeartbeatDispatcher.java
@@ -306,12 +306,20 @@ public final class SCMDatanodeHeartbeatDispatcher {
extends ReportFromDatanode<ContainerReportsProto>
implements ContainerReport, IEventInfo {
private long createTime = Time.monotonicNow();
+ // Used to identify whether container reporting is from a registration.
+ private boolean isRegister = false;
public ContainerReportFromDatanode(DatanodeDetails datanodeDetails,
ContainerReportsProto report) {
super(datanodeDetails, report);
}
+ public ContainerReportFromDatanode(DatanodeDetails datanodeDetails,
+ ContainerReportsProto report, boolean isRegister) {
+ super(datanodeDetails, report);
+ this.isRegister = isRegister;
+ }
+
@Override
public boolean equals(Object o) {
return this == o;
@@ -331,6 +339,10 @@ public final class SCMDatanodeHeartbeatDispatcher {
return createTime;
}
+ public boolean isRegister() {
+ return isRegister;
+ }
+
@Override
public String getEventId() {
return getDatanodeDetails().toString() + ", {type: " + getType()
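
The extra constructor above is how the registration path tags its container report, and ContainerReportHandler re-fires CONTAINER_REGISTRATION_REPORT only when that flag is set. A small sketch of the two construction paths; MockDatanodeDetails.randomDatanodeDetails() is a test helper and ContainerReportsProto.getDefaultInstance() a protobuf default, both assumed from outside this diff:

import org.apache.hadoop.hdds.protocol.DatanodeDetails;
import org.apache.hadoop.hdds.protocol.MockDatanodeDetails;
import org.apache.hadoop.hdds.protocol.proto.StorageContainerDatanodeProtocolProtos.ContainerReportsProto;
import org.apache.hadoop.hdds.scm.server.SCMDatanodeHeartbeatDispatcher.ContainerReportFromDatanode;

public final class RegistrationReportFlagSketch {
  public static void main(String[] args) {
    DatanodeDetails dn = MockDatanodeDetails.randomDatanodeDetails();
    ContainerReportsProto report = ContainerReportsProto.getDefaultInstance();

    // Heartbeat path: the flag defaults to false.
    ContainerReportFromDatanode fromHeartbeat =
        new ContainerReportFromDatanode(dn, report);
    // Registration path: SCMDatanodeProtocolServer now passes true.
    ContainerReportFromDatanode fromRegistration =
        new ContainerReportFromDatanode(dn, report, true);

    System.out.println(fromHeartbeat.isRegister());    // expected: false
    System.out.println(fromRegistration.isRegister()); // expected: true
  }
}
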
diff --git a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/server/SCMDatanodeProtocolServer.java b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/server/SCMDatanodeProtocolServer.java
index 105e7ac348..b03f737b34 100644
--- a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/server/SCMDatanodeProtocolServer.java
+++ b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/server/SCMDatanodeProtocolServer.java
@@ -251,7 +251,7 @@ public class SCMDatanodeProtocolServer implements
== SCMRegisteredResponseProto.ErrorCode.success) {
eventPublisher.fireEvent(CONTAINER_REPORT,
new SCMDatanodeHeartbeatDispatcher.ContainerReportFromDatanode(
- datanodeDetails, containerReportsProto));
+ datanodeDetails, containerReportsProto, true));
eventPublisher.fireEvent(SCMEvents.NODE_REGISTRATION_CONT_REPORT,
new NodeRegistrationContainerReport(datanodeDetails,
containerReportsProto));
diff --git a/hadoop-hdds/server-scm/src/main/resources/webapps/scm/scm-overview.html b/hadoop-hdds/server-scm/src/main/resources/webapps/scm/scm-overview.html
index 2748716e67..7bfe405850 100644
--- a/hadoop-hdds/server-scm/src/main/resources/webapps/scm/scm-overview.html
+++ b/hadoop-hdds/server-scm/src/main/resources/webapps/scm/scm-overview.html
@@ -387,7 +387,7 @@
<tbody>
<tr ng-repeat="typestat in $ctrl.overview.jmx.SafeModeRuleStatus">
<td>{{typestat.key}}</td>
- <td>{{typestat.value[0]}}</td>
+ <td ng-bind-html="formatValue(typestat.value[0])"></td>
<td>{{typestat.value[1]}}</td>
</tr>
</tbody>
diff --git a/hadoop-hdds/server-scm/src/main/resources/webapps/scm/scm.js b/hadoop-hdds/server-scm/src/main/resources/webapps/scm/scm.js
index fc216c0686..eca79852e4 100644
--- a/hadoop-hdds/server-scm/src/main/resources/webapps/scm/scm.js
+++ b/hadoop-hdds/server-scm/src/main/resources/webapps/scm/scm.js
@@ -24,7 +24,7 @@
require: {
overview: "^overview"
},
- controller: function ($http,$scope) {
+ controller: function ($http,$scope,$sce) {
var ctrl = this;
$scope.reverse = false;
$scope.columnName = "hostname";
@@ -140,6 +140,14 @@
$scope.lastIndex = Math.ceil(nodeStatusCopy.length / $scope.RecordsToDisplay);
$scope.nodeStatus = nodeStatusCopy.slice(0, $scope.RecordsToDisplay);
+ $scope.formatValue = function(value) {
+ if (value && value.includes(';')) {
+ return $sce.trustAsHtml(value.replace(/;/g, '<br>'));
+ } else {
+ return $sce.trustAsHtml(value);
+ }
+ };
+
ctrl.nodemanagermetrics.NodeStatistics.forEach(({key,
value}) => {
if(key == "Min") {
$scope.statistics.nodes.usages.min = value;
diff --git a/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/HddsTestUtils.java b/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/HddsTestUtils.java
index fe5459764c..787f83e1a8 100644
--- a/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/HddsTestUtils.java
+++ b/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/HddsTestUtils.java
@@ -828,10 +828,36 @@ public final class HddsTestUtils {
*/
public static List<ContainerInfo> getContainerInfo(int numContainers) {
List<ContainerInfo> containerInfoList = new ArrayList<>();
+ RatisReplicationConfig ratisReplicationConfig =
+ RatisReplicationConfig.getInstance(ReplicationFactor.THREE);
for (int i = 0; i < numContainers; i++) {
ContainerInfo.Builder builder = new ContainerInfo.Builder();
containerInfoList.add(builder
.setContainerID(RandomUtils.nextLong())
+ .setReplicationConfig(ratisReplicationConfig)
+ .build());
+ }
+ return containerInfoList;
+ }
+
+ /**
+ * Generate EC Container data.
+ *
+ * @param numContainers number of ContainerInfo to be included in list.
+ * @param data Number of data blocks.
+ * @param parity Number of parity blocks.
+ * @return {@literal List<ContainerInfo>}
+ */
+ public static List<ContainerInfo> getECContainerInfo(int numContainers, int data, int parity) {
+ List<ContainerInfo> containerInfoList = new ArrayList<>();
+ ECReplicationConfig eCReplicationConfig = new ECReplicationConfig(data, parity);
+ for (int i = 0; i < numContainers; i++) {
+ ContainerInfo.Builder builder = new ContainerInfo.Builder();
+ containerInfoList.add(builder
+ .setContainerID(RandomUtils.nextLong())
+ .setOwner("test-owner")
+ .setPipelineID(PipelineID.randomId())
+ .setReplicationConfig(eCReplicationConfig)
.build());
}
return containerInfoList;
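
As a usage note, the EC safemode test added below drives this helper roughly as in the following sketch: generate EC(3,2) containers, mark the ones that should count toward the rule as CLOSED with a non-zero key count, and leave the rest OPEN so the rule skips them (setState/setNumberOfKeys as used in the test):

import java.util.List;
import org.apache.hadoop.hdds.protocol.proto.HddsProtos;
import org.apache.hadoop.hdds.scm.HddsTestUtils;
import org.apache.hadoop.hdds.scm.container.ContainerInfo;

public final class EcTestContainerSetupSketch {
  public static void main(String[] args) {
    // 100 EC-3-2 containers, as in testContainerSafeModeRuleEC.
    List<ContainerInfo> containers = HddsTestUtils.getECContainerInfo(100, 3, 2);

    // Only CLOSED (or QUASI_CLOSED) containers with keys count for the rule.
    for (ContainerInfo container : containers.subList(0, 25)) {
      container.setState(HddsProtos.LifeCycleState.CLOSED);
      container.setNumberOfKeys(10);
    }
    // OPEN containers are ignored by ContainerSafeModeRule.
    for (ContainerInfo container : containers.subList(25, 100)) {
      container.setState(HddsProtos.LifeCycleState.OPEN);
    }
  }
}
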
diff --git a/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/safemode/TestSCMSafeModeManager.java b/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/safemode/TestSCMSafeModeManager.java
index eedef07944..05e2317765 100644
--- a/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/safemode/TestSCMSafeModeManager.java
+++ b/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/safemode/TestSCMSafeModeManager.java
@@ -20,6 +20,7 @@ package org.apache.hadoop.hdds.scm.safemode;
import java.io.File;
import java.io.IOException;
import java.time.Clock;
+import java.time.ZoneId;
import java.time.ZoneOffset;
import java.util.ArrayList;
import java.util.Collections;
@@ -40,7 +41,10 @@ import org.apache.hadoop.hdds.protocol.proto.HddsProtos.ReplicationFactor;
import org.apache.hadoop.hdds.protocol.proto.StorageContainerDatanodeProtocolProtos;
import org.apache.hadoop.hdds.scm.HddsTestUtils;
import org.apache.hadoop.hdds.scm.container.ContainerInfo;
+import org.apache.hadoop.hdds.scm.container.ContainerManager;
+import org.apache.hadoop.hdds.scm.container.ContainerManagerImpl;
import org.apache.hadoop.hdds.scm.container.MockNodeManager;
+import org.apache.hadoop.hdds.scm.container.replication.ContainerReplicaPendingOps;
import org.apache.hadoop.hdds.scm.events.SCMEvents;
import org.apache.hadoop.hdds.scm.ha.SCMHAManagerStub;
import org.apache.hadoop.hdds.scm.ha.SCMContext;
@@ -53,6 +57,7 @@ import org.apache.hadoop.hdds.scm.pipeline.PipelineManager;
import org.apache.hadoop.hdds.scm.pipeline.PipelineProvider;
import org.apache.hadoop.hdds.scm.pipeline.PipelineManagerImpl;
import org.apache.hadoop.hdds.scm.server.SCMDatanodeHeartbeatDispatcher;
+import org.apache.hadoop.hdds.scm.server.SCMDatanodeProtocolServer;
import org.apache.hadoop.hdds.server.events.EventHandler;
import org.apache.hadoop.hdds.server.events.EventPublisher;
import org.apache.hadoop.hdds.server.events.EventQueue;
@@ -74,6 +79,7 @@ import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertFalse;
import static org.junit.jupiter.api.Assertions.assertThrows;
import static org.junit.jupiter.api.Assertions.assertTrue;
+import static org.junit.jupiter.params.provider.Arguments.arguments;
import static org.mockito.Mockito.mock;
/** Test class for SCMSafeModeManager.
@@ -100,8 +106,7 @@ public class TestSCMSafeModeManager {
config = new OzoneConfiguration();
config.setBoolean(HddsConfigKeys.HDDS_SCM_SAFEMODE_PIPELINE_CREATION,
false);
- config.set(HddsConfigKeys.OZONE_METADATA_DIRS,
- tempDir.getAbsolutePath().toString());
+ config.set(HddsConfigKeys.OZONE_METADATA_DIRS, tempDir.getAbsolutePath());
scmMetadataStore = new SCMMetadataStoreImpl(config);
}
@@ -139,8 +144,10 @@ public class TestSCMSafeModeManager {
assertTrue(scmSafeModeManager.getInSafeMode());
validateRuleStatus("DatanodeSafeModeRule", "registered datanodes 0");
- queue.fireEvent(SCMEvents.NODE_REGISTRATION_CONT_REPORT,
- HddsTestUtils.createNodeRegistrationContainerReport(containers));
+ SCMDatanodeProtocolServer.NodeRegistrationContainerReport nodeRegistrationContainerReport =
+ HddsTestUtils.createNodeRegistrationContainerReport(containers);
+ queue.fireEvent(SCMEvents.NODE_REGISTRATION_CONT_REPORT, nodeRegistrationContainerReport);
+ queue.fireEvent(SCMEvents.CONTAINER_REGISTRATION_REPORT, nodeRegistrationContainerReport);
long cutOff = (long) Math.ceil(numContainers * config.getDouble(
HddsConfigKeys.HDDS_SCM_SAFEMODE_THRESHOLD_PCT,
@@ -181,7 +188,7 @@ public class TestSCMSafeModeManager {
assertTrue(scmSafeModeManager.getInSafeMode());
validateRuleStatus("ContainerSafeModeRule",
- "% of containers with at least one reported");
+ "0.00% of [Ratis] Containers(0 / 100) with at least one reported");
testContainerThreshold(containers.subList(0, 25), 0.25);
assertEquals(25, scmSafeModeManager.getSafeModeMetrics()
.getCurrentContainersWithOneReplicaReportedCount().value());
@@ -492,6 +499,79 @@ public class TestSCMSafeModeManager {
100, 1000 * 5);
}
+ // We simulate common EC types: EC-2-2-1024K, EC-3-2-1024K, EC-6-3-1024K.
+ static Stream<Arguments> processECDataParityCombination() {
+ Stream<Arguments> args = Stream.of(arguments(2, 2),
+ arguments(3, 2), arguments(6, 3));
+ return args;
+ }
+
+ @ParameterizedTest
+ @MethodSource("processECDataParityCombination")
+ public void testContainerSafeModeRuleEC(int data, int parity) throws Exception {
+ containers = new ArrayList<>();
+
+ // We generate 100 EC Containers.
+ containers.addAll(HddsTestUtils.getECContainerInfo(25 * 4, data, parity));
+
+ // Prepare the data for the container.
+ // We have prepared 25 containers in the CLOSED state and 75 containers in the OPEN state.
+ // Out of the 25 containers, only 20 containers have a NumberOfKeys greater than 0.
+ for (ContainerInfo container : containers.subList(0, 25)) {
+ container.setState(HddsProtos.LifeCycleState.CLOSED);
+ container.setNumberOfKeys(10);
+ }
+
+ for (ContainerInfo container : containers.subList(25, 100)) {
+ container.setState(HddsProtos.LifeCycleState.OPEN);
+ container.setNumberOfKeys(10);
+ }
+
+ // Set the last 5 closed containers to be empty
+ for (ContainerInfo container : containers.subList(20, 25)) {
+ container.setNumberOfKeys(0);
+ }
+
+ for (ContainerInfo container : containers) {
+ scmMetadataStore.getContainerTable().put(container.containerID(), container);
+ }
+
+ // Declare SCMSafeModeManager and confirm entry into Safe Mode.
+ EventQueue eventQueue = new EventQueue();
+ MockNodeManager nodeManager = new MockNodeManager(true, 0);
+ PipelineManager pipelineManager = PipelineManagerImpl.newPipelineManager(
+ config,
+ SCMHAManagerStub.getInstance(true),
+ nodeManager,
+ scmMetadataStore.getPipelineTable(),
+ eventQueue,
+ scmContext,
+ serviceManager,
+ Clock.system(ZoneOffset.UTC));
+
+ ContainerManager containerManager = new ContainerManagerImpl(config,
+ SCMHAManagerStub.getInstance(true), null, pipelineManager,
+ scmMetadataStore.getContainerTable(),
+ new ContainerReplicaPendingOps(Clock.system(ZoneId.systemDefault())));
+
+ scmSafeModeManager = new SCMSafeModeManager(
+ config, containers, containerManager, pipelineManager, queue, serviceManager, scmContext);
+ assertTrue(scmSafeModeManager.getInSafeMode());
+
+ // Only 20 containers are involved in the calculation,
+ // so when 10 containers complete registration, our threshold is 50%.
+ testECContainerThreshold(containers.subList(0, 10), 0.5, data);
+ assertTrue(scmSafeModeManager.getInSafeMode());
+
+ // When the registration of the remaining containers is completed,
+ // the threshold will reach 100%.
+ testECContainerThreshold(containers.subList(10, 20), 1.0, data);
+
+ ContainerSafeModeRule containerSafeModeRule =
+ scmSafeModeManager.getContainerSafeModeRule();
+ assertTrue(containerSafeModeRule.validate());
+ }
+
private void testSafeModeDataNodes(int numOfDns) throws Exception {
OzoneConfiguration conf = new OzoneConfiguration(config);
conf.setInt(HddsConfigKeys.HDDS_SCM_SAFEMODE_MIN_DATANODE, numOfDns);
@@ -504,8 +584,10 @@ public class TestSCMSafeModeManager {
// Register all DataNodes except last one and assert SCM is in safe mode.
for (int i = 0; i < numOfDns - 1; i++) {
- queue.fireEvent(SCMEvents.NODE_REGISTRATION_CONT_REPORT,
- HddsTestUtils.createNodeRegistrationContainerReport(containers));
+ SCMDatanodeProtocolServer.NodeRegistrationContainerReport nodeRegistrationContainerReport =
+ HddsTestUtils.createNodeRegistrationContainerReport(containers);
+ queue.fireEvent(SCMEvents.NODE_REGISTRATION_CONT_REPORT, nodeRegistrationContainerReport);
+ queue.fireEvent(SCMEvents.CONTAINER_REGISTRATION_REPORT, nodeRegistrationContainerReport);
assertTrue(scmSafeModeManager.getInSafeMode());
assertEquals(1, scmSafeModeManager.getCurrentContainerThreshold());
}
@@ -525,14 +607,52 @@ public class TestSCMSafeModeManager {
private void testContainerThreshold(List<ContainerInfo> dnContainers,
double expectedThreshold)
throws Exception {
+ SCMDatanodeProtocolServer.NodeRegistrationContainerReport nodeRegistrationContainerReport =
+ HddsTestUtils.createNodeRegistrationContainerReport(dnContainers);
queue.fireEvent(SCMEvents.NODE_REGISTRATION_CONT_REPORT,
- HddsTestUtils.createNodeRegistrationContainerReport(dnContainers));
+ nodeRegistrationContainerReport);
+ queue.fireEvent(SCMEvents.CONTAINER_REGISTRATION_REPORT,
+ nodeRegistrationContainerReport);
GenericTestUtils.waitFor(() -> {
double threshold = scmSafeModeManager.getCurrentContainerThreshold();
return threshold == expectedThreshold;
}, 100, 2000 * 9);
}
+ /**
+ * Test ECContainer reaching SafeMode threshold.
+ *
+ * @param dnContainers
+ * The list of containers that need to reach the threshold.
+ * @param expectedThreshold
+ * The expected threshold.
+ * @param dataBlockNum
+ * The number of data blocks. For EC-3-2-1024K,
+ * we need 3 registration requests to ensure the EC Container is confirmed.
+ * For EC-6-3-1024K, we need 6 registration requests to ensure the EC Container is confirmed.
+ * @throws Exception The thrown exception message.
+ */
+ private void testECContainerThreshold(List<ContainerInfo> dnContainers,
+ double expectedThreshold, int dataBlockNum) throws Exception {
+
+ // Step 1. Fire dataBlockNum registration reports so that each EC container
+ // is reported by enough distinct datanodes to be confirmed.
+ for (int i = 0; i < dataBlockNum; i++) {
+ SCMDatanodeProtocolServer.NodeRegistrationContainerReport nodeRegistrationContainerReport =
+ HddsTestUtils.createNodeRegistrationContainerReport(dnContainers);
+ queue.fireEvent(SCMEvents.NODE_REGISTRATION_CONT_REPORT,
+ nodeRegistrationContainerReport);
+ queue.fireEvent(SCMEvents.CONTAINER_REGISTRATION_REPORT,
+ nodeRegistrationContainerReport);
+ }
+
+ // Step2. Wait for the threshold to be reached.
+ GenericTestUtils.waitFor(() -> {
+ double threshold = scmSafeModeManager.getCurrentECContainerThreshold();
+ return threshold == expectedThreshold;
+ }, 100, 2000 * 9);
+ }
+
@Test
public void testSafeModePipelineExitRule() throws Exception {
containers = new ArrayList<>();
@@ -571,8 +691,11 @@ public class TestSCMSafeModeManager {
config, containers, null, pipelineManager, queue, serviceManager,
scmContext);
- queue.fireEvent(SCMEvents.NODE_REGISTRATION_CONT_REPORT,
- HddsTestUtils.createNodeRegistrationContainerReport(containers));
+ SCMDatanodeProtocolServer.NodeRegistrationContainerReport nodeRegistrationContainerReport =
+ HddsTestUtils.createNodeRegistrationContainerReport(containers);
+ queue.fireEvent(SCMEvents.NODE_REGISTRATION_CONT_REPORT, nodeRegistrationContainerReport);
+ queue.fireEvent(SCMEvents.CONTAINER_REGISTRATION_REPORT, nodeRegistrationContainerReport);
+
assertTrue(scmSafeModeManager.getInSafeMode());
firePipelineEvent(pipelineManager, pipeline);
@@ -629,8 +752,10 @@ public class TestSCMSafeModeManager {
// Register all DataNodes except last one and assert SCM is in safe mode.
for (int i = 0; i < numOfDns - 1; i++) {
- queue.fireEvent(SCMEvents.NODE_REGISTRATION_CONT_REPORT,
- HddsTestUtils.createNodeRegistrationContainerReport(containers));
+ SCMDatanodeProtocolServer.NodeRegistrationContainerReport nodeRegistrationContainerReport =
+ HddsTestUtils.createNodeRegistrationContainerReport(containers);
+ queue.fireEvent(SCMEvents.NODE_REGISTRATION_CONT_REPORT, nodeRegistrationContainerReport);
+ queue.fireEvent(SCMEvents.CONTAINER_REGISTRATION_REPORT, nodeRegistrationContainerReport);
assertTrue(scmSafeModeManager.getInSafeMode());
assertFalse(scmSafeModeManager.getPreCheckComplete());
}
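
To make the EC threshold arithmetic in these tests concrete: of the 100 generated EC containers only the 20 CLOSED ones with keys are tracked, so ecMaxContainer is 20 and, with the default hdds.scm.safemode.threshold.pct of 0.99, ceil(20 * 0.99) = 20 containers must be confirmed; after 10 containers have been reported by at least data (the EC data-block count) distinct datanodes, the threshold is 10 / 20 = 0.5 and SCM stays in safe mode. A tiny sketch of that calculation (plain arithmetic, not SCM code):

public final class EcThresholdMathSketch {
  public static void main(String[] args) {
    double safeModeCutoff = 0.99;      // hdds.scm.safemode.threshold.pct default
    int trackedEcContainers = 20;      // CLOSED EC containers with keys > 0
    long cutOff = (long) Math.ceil(trackedEcContainers * safeModeCutoff);
    System.out.println(cutOff);        // 20 containers must be confirmed to exit

    int confirmed = 10;                // containers reported by >= data datanodes
    double threshold = (double) confirmed / trackedEcContainers;
    System.out.println(threshold);     // 0.5, still below the cutoff
  }
}
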