This is an automated email from the ASF dual-hosted git repository.
erose pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/ozone.git
The following commit(s) were added to refs/heads/master by this push:
new 4ffae705c7 HDDS-12239. Volume should not be marked as unhealthy when
disk full (#7830)
4ffae705c7 is described below
commit 4ffae705c72124ab11d1941f529c4a8bf30f2012
Author: Ashish Kumar <[email protected]>
AuthorDate: Wed Mar 26 19:38:18 2025 +0530
HDDS-12239. Volume should not be marked as unhealthy when disk full (#7830)
Co-authored-by: ashishk <[email protected]>
---
.../container/common/volume/StorageVolume.java | 17 ++++++++
.../volume/TestStorageVolumeHealthChecks.java | 47 ++++++++++++++++++++++
2 files changed, 64 insertions(+)
diff --git
a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/volume/StorageVolume.java
b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/volume/StorageVolume.java
index 318fa0ab9d..639317af88 100644
---
a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/volume/StorageVolume.java
+++
b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/volume/StorageVolume.java
@@ -623,6 +623,15 @@ public synchronized VolumeCheckResult check(@Nullable
Boolean unused)
return VolumeCheckResult.HEALTHY;
}
+ // At least some free space is required to check disk read/write.
+ // If there is not enough space remaining, skip the disk read/write
+ // check so that a full disk does not cause a volume failure.
+ int minimumDiskSpace = healthCheckFileSize * 2;
+ if (volumeInfo.get().getCurrentUsage().getAvailable() < minimumDiskSpace) {
+ ioTestSlidingWindow.add(true);
+ return VolumeCheckResult.HEALTHY;
+ }
+
// Since IO errors may be intermittent, volume remains healthy until the
// threshold of failures is crossed.
boolean diskChecksPassed = DiskCheckUtil.checkReadWrite(storageDir,
@@ -634,6 +643,14 @@ public synchronized VolumeCheckResult check(@Nullable
Boolean unused)
" interrupted.");
}
+ // Writes keep happening, so the disk may have become full
during the above check.
+ // Check again whether the disk is full. If it is,
+ // keep the volume healthy so that reads can still be served.
+ if (!diskChecksPassed && volumeInfo.get().getCurrentUsage().getAvailable()
< minimumDiskSpace) {
+ ioTestSlidingWindow.add(true);
+ return VolumeCheckResult.HEALTHY;
+ }
+
// Move the sliding window of IO test results forward 1 by adding the
// latest entry and removing the oldest entry from the window.
// Update the failure counter for the new window.
diff --git
a/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/common/volume/TestStorageVolumeHealthChecks.java
b/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/common/volume/TestStorageVolumeHealthChecks.java
index eddf80ef42..9e16e4f9b7 100644
---
a/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/common/volume/TestStorageVolumeHealthChecks.java
+++
b/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/common/volume/TestStorageVolumeHealthChecks.java
@@ -110,6 +110,53 @@ public boolean checkExistence(File storageDir) {
assertEquals(VolumeCheckResult.FAILED, result);
}
+ @ParameterizedTest
+ @MethodSource("volumeBuilders")
+ public void testVolumeFullHealth(StorageVolume.Builder<?> builder) throws
Exception {
+ verifyFullVolumeHealthWithDiskReadWriteStatus(builder, true, false);
+ }
+
+
+ public void
verifyFullVolumeHealthWithDiskReadWriteStatus(StorageVolume.Builder<?> builder,
boolean... checkResult)
+ throws Exception {
+
+ for (boolean result : checkResult) {
+ StorageVolume volume = builder.build();
+
+ VolumeUsage usage = volume.getVolumeInfo().get().getUsageForTesting();
+ DatanodeConfiguration dnConf =
CONF.getObject(DatanodeConfiguration.class);
+ int minimumDiskSpace = dnConf.getVolumeHealthCheckFileSize() * 2;
+ // Leave the remaining space at just under twice the
VolumeHealthCheckFileSize.
+ usage.incrementUsedSpace(usage.getCurrentUsage().getAvailable() -
minimumDiskSpace + 1);
+ usage.realUsage();
+ DiskCheckUtil.DiskChecks ioFailure = new DiskCheckUtil.DiskChecks() {
+ @Override
+ public boolean checkReadWrite(File storageDir, File testFileDir,
+ int numBytesToWrite) {
+ return result;
+ }
+ };
+ DiskCheckUtil.setTestImpl(ioFailure);
+ // Volume will remain healthy as it doesn't have enough space to check
READ/WRITE
+ assertEquals(VolumeCheckResult.HEALTHY, volume.check(false));
+ // Even in second try volume will remain HEALTHY
+ assertEquals(VolumeCheckResult.HEALTHY, volume.check(false));
+
+ // Now keep enough space for read/write check to go through
+ usage.decrementUsedSpace(minimumDiskSpace + 1);
+
+ // volumeIOFailureTolerance is 1, so the first check is always HEALTHY
+ assertEquals(VolumeCheckResult.HEALTHY, volume.check(false));
+ if (result) {
+ // Volume will remain as healthy as READ/WRITE check is fine
+ assertEquals(VolumeCheckResult.HEALTHY, volume.check(false));
+ } else {
+ // Second time volume will fail as READ/WRITE check has failed
+ assertEquals(VolumeCheckResult.FAILED, volume.check(false));
+ }
+ }
+ }
+
@ParameterizedTest
@MethodSource("volumeBuilders")
public void testCheckPermissions(StorageVolume.Builder<?> builder)
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]