This is an automated email from the ASF dual-hosted git repository.
erose pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/ozone.git
The following commit(s) were added to refs/heads/master by this push:
new aca0f9ea7a HDDS-13084. Trigger on-demand container scan when a
container moves from open to unhealthy. (#8904)
aca0f9ea7a is described below
commit aca0f9ea7a5a82539101cec8c4333f206d4d85f6
Author: Aswin Shakil Balasubramanian <[email protected]>
AuthorDate: Mon Aug 11 10:07:00 2025 -0700
HDDS-13084. Trigger on-demand container scan when a container moves from
open to unhealthy. (#8904)
---
.../container/common/impl/HddsDispatcher.java | 5 +-
.../ozone/container/keyvalue/KeyValueHandler.java | 2 +-
.../ozone/container/ozoneimpl/OzoneContainer.java | 5 ++
.../apache/hadoop/hdds/scm/TestCloseContainer.java | 5 +-
.../TestContainerScannerIntegrationAbstract.java | 5 ++
.../TestOnDemandContainerScannerIntegration.java | 55 ++++++++++++++++++++++
6 files changed, 71 insertions(+), 6 deletions(-)
diff --git
a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/impl/HddsDispatcher.java
b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/impl/HddsDispatcher.java
index ff6e199db2..ea47c4945b 100644
---
a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/impl/HddsDispatcher.java
+++
b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/impl/HddsDispatcher.java
@@ -402,12 +402,13 @@ && getMissingContainerSet().contains(containerID)) {
|| containerState == State.RECOVERING);
// mark and persist the container state to be unhealthy
try {
- // TODO HDDS-7096 + HDDS-8781: Use on demand scanning for the open
- // container instead.
ContainerScanError error = new
ContainerScanError(ContainerScanError.FailureType.WRITE_FAILURE,
new File(container.getContainerData().getContainerPath()),
new StorageContainerException(result));
handler.markContainerUnhealthy(container,
DataScanResult.fromErrors(Collections.singletonList(error)));
+ // For unhealthy containers, trigger an async on-demand scan to
build container merkle tree,
+ // as the metadata-based tree may not be reliable due to potential
data corruption.
+ containerSet.scanContainerWithoutGap(containerID, "Unhealthy
container scan");
LOG.info("Marked Container UNHEALTHY, ContainerID: {}", containerID);
} catch (IOException ioe) {
// just log the error here in case marking the container fails,
diff --git
a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/keyvalue/KeyValueHandler.java
b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/keyvalue/KeyValueHandler.java
index bea4aa0af8..aee346ec89 100644
---
a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/keyvalue/KeyValueHandler.java
+++
b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/keyvalue/KeyValueHandler.java
@@ -1493,7 +1493,7 @@ private ContainerProtos.ContainerChecksumInfo
updateAndGetContainerChecksum(Cont
public void markContainerUnhealthy(Container container, ScanResult reason)
throws IOException {
container.writeLock();
- long containerID = 0L;
+ long containerID = container.getContainerData().getContainerID();
try {
if (container.getContainerState() == State.UNHEALTHY) {
LOG.debug("Call to mark already unhealthy container {} as unhealthy",
diff --git
a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/ozoneimpl/OzoneContainer.java
b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/ozoneimpl/OzoneContainer.java
index 91816e97a7..e0f255d640 100644
---
a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/ozoneimpl/OzoneContainer.java
+++
b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/ozoneimpl/OzoneContainer.java
@@ -472,6 +472,11 @@ public void resumeContainerScrub() {
backgroundScanners.forEach(AbstractBackgroundContainerScanner::unpause);
}
+ @VisibleForTesting
+ public OnDemandContainerScanner getOnDemandScanner() {
+ return onDemandScanner;
+ }
+
/**
* Starts serving requests to ozone container.
*
diff --git
a/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/hdds/scm/TestCloseContainer.java
b/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/hdds/scm/TestCloseContainer.java
index 7910b6908c..7cbe2cc65b 100644
---
a/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/hdds/scm/TestCloseContainer.java
+++
b/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/hdds/scm/TestCloseContainer.java
@@ -34,7 +34,6 @@
import static org.junit.jupiter.api.Assertions.assertNotEquals;
import static org.junit.jupiter.api.Assertions.assertNotNull;
import static org.junit.jupiter.api.Assertions.assertThrows;
-import static org.junit.jupiter.api.Assertions.assertTrue;
import java.io.IOException;
import java.time.Duration;
@@ -140,7 +139,7 @@ public void
testReplicasAreReportedForClosedContainerAfterRestart()
// Checksum file exists after container close
for (HddsDatanodeService hddsDatanode: hddsDatanodes) {
GenericTestUtils.waitFor(() ->
checkContainerCloseInDatanode(hddsDatanode, container), 100, 5000);
- assertTrue(containerChecksumFileExists(hddsDatanode,
container.getContainerID()));
+ GenericTestUtils.waitFor(() -> containerChecksumFileExists(hddsDatanode,
container.getContainerID()), 100, 5000);
}
long originalSeq = container.getSequenceId();
@@ -197,7 +196,7 @@ public void testCloseClosedContainer()
// Checksum file exists after container close
for (HddsDatanodeService hddsDatanode: hddsDatanodes) {
GenericTestUtils.waitFor(() ->
checkContainerCloseInDatanode(hddsDatanode, container), 100, 5000);
- assertTrue(containerChecksumFileExists(hddsDatanode,
container.getContainerID()));
+ GenericTestUtils.waitFor(() -> containerChecksumFileExists(hddsDatanode,
container.getContainerID()), 100, 5000);
}
for (ContainerReplica replica : getContainerReplicas(container)) {
diff --git
a/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/dn/scanner/TestContainerScannerIntegrationAbstract.java
b/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/dn/scanner/TestContainerScannerIntegrationAbstract.java
index 086d4b41ef..f99157e7c9 100644
---
a/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/dn/scanner/TestContainerScannerIntegrationAbstract.java
+++
b/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/dn/scanner/TestContainerScannerIntegrationAbstract.java
@@ -224,4 +224,9 @@ private OzoneOutputStream createKey(String keyName) throws
Exception {
protected OzoneConfiguration getConf() {
return cluster.getConf();
}
+
+ protected HddsDatanodeService getDatanode() {
+ assertEquals(1, cluster.getHddsDatanodes().size());
+ return cluster.getHddsDatanodes().get(0);
+ }
}
diff --git
a/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/dn/scanner/TestOnDemandContainerScannerIntegration.java
b/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/dn/scanner/TestOnDemandContainerScannerIntegration.java
index d27b78aa59..e81b3244a9 100644
---
a/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/dn/scanner/TestOnDemandContainerScannerIntegration.java
+++
b/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/dn/scanner/TestOnDemandContainerScannerIntegration.java
@@ -26,20 +26,26 @@
import static org.junit.jupiter.api.Assertions.assertNotEquals;
import static org.junit.jupiter.api.Assertions.assertTrue;
+import java.time.Instant;
import java.util.Collection;
import java.util.EnumSet;
+import java.util.Optional;
import java.util.Set;
import org.apache.hadoop.hdds.conf.OzoneConfiguration;
import org.apache.hadoop.hdds.protocol.datanode.proto.ContainerProtos;
import
org.apache.hadoop.hdds.protocol.datanode.proto.ContainerProtos.ContainerDataProto.State;
+import org.apache.hadoop.ozone.HddsDatanodeService;
import org.apache.hadoop.ozone.container.common.interfaces.Container;
+import org.apache.hadoop.ozone.container.common.interfaces.ContainerDispatcher;
import org.apache.hadoop.ozone.container.common.utils.ContainerLogger;
import org.apache.hadoop.ozone.container.keyvalue.KeyValueContainerData;
import org.apache.hadoop.ozone.container.keyvalue.TestContainerCorruptions;
import
org.apache.hadoop.ozone.container.ozoneimpl.ContainerScannerConfiguration;
import org.apache.hadoop.ozone.container.ozoneimpl.OnDemandContainerScanner;
+import org.apache.hadoop.ozone.container.ozoneimpl.OnDemandScannerMetrics;
import org.apache.ozone.test.GenericTestUtils;
import org.junit.jupiter.api.BeforeAll;
+import org.junit.jupiter.api.Test;
import org.junit.jupiter.params.ParameterizedTest;
import org.junit.jupiter.params.provider.MethodSource;
@@ -182,4 +188,53 @@ void
testCorruptionDetectedForOpenContainers(TestContainerCorruptions corruption
corruption.assertLogged(openContainerID, 1, logCapturer);
}
+ /**
+ * Test that {@link OnDemandContainerScanner} is triggered when the
HddsDispatcher
+ * detects write failures and automatically triggers on-demand scans.
+ */
+ @Test
+ void testOnDemandScanTriggeredByUnhealthyContainer() throws Exception {
+ long containerID = writeDataToOpenContainer();
+ Container<?> container = getDnContainer(containerID);
+ assertEquals(State.OPEN, container.getContainerState());
+
+ Optional<Instant> initialScanTime =
container.getContainerData().lastDataScanTime();
+ HddsDatanodeService dn = getDatanode();
+ ContainerDispatcher dispatcher =
dn.getDatanodeStateMachine().getContainer().getDispatcher();
+ OnDemandScannerMetrics scannerMetrics =
dn.getDatanodeStateMachine().getContainer()
+ .getOnDemandScanner().getMetrics();
+ int initialScannedCount = scannerMetrics.getNumContainersScanned();
+
+ // Create a PutBlock request with malformed block data to trigger internal
error
+ ContainerProtos.ContainerCommandRequestProto writeFailureRequest =
+ ContainerProtos.ContainerCommandRequestProto.newBuilder()
+ .setCmdType(ContainerProtos.Type.PutBlock)
+ .setContainerID(containerID)
+ .setDatanodeUuid(dn.getDatanodeDetails().getUuidString())
+ .setPutBlock(ContainerProtos.PutBlockRequestProto.newBuilder()
+ .setBlockData(ContainerProtos.BlockData.newBuilder()
+ .setBlockID(ContainerProtos.DatanodeBlockID.newBuilder()
+ .setContainerID(containerID)
+ .setLocalID(999L)
+ .setBlockCommitSequenceId(1)
+ .build())
+ .setSize(1024) // Size mismatch with chunks
+ .build())
+ .build())
+ .build();
+
+ ContainerProtos.ContainerCommandResponseProto response =
dispatcher.dispatch(writeFailureRequest, null);
+ assertNotEquals(ContainerProtos.Result.SUCCESS, response.getResult());
+ assertEquals(State.UNHEALTHY, container.getContainerState());
+
+ // The dispatcher should have called containerSet.scanContainerWithoutGap
due to the failure
+ GenericTestUtils.waitFor(() -> {
+ Optional<Instant> currentScanTime =
container.getContainerData().lastDataScanTime();
+ return currentScanTime.isPresent() &&
currentScanTime.get().isAfter(initialScanTime.orElse(Instant.EPOCH));
+ }, 500, 5000);
+
+ int finalScannedCount = scannerMetrics.getNumContainersScanned();
+ assertTrue(finalScannedCount > initialScannedCount);
+ }
+
}
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]