This is an automated email from the ASF dual-hosted git repository.
tejaskriya pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/ozone.git
The following commit(s) were added to refs/heads/master by this push:
new e06c19312ac HDDS-13123. Add testing for the `ozone repair om skip-ratis-transaction` command (#8810)
e06c19312ac is described below
commit e06c19312ac742406b1f4ff07d4395c67dc6e443
Author: Tejaskriya <[email protected]>
AuthorDate: Wed Jul 30 14:03:57 2025 +0530
HDDS-13123. Add testing for the `ozone repair om skip-ratis-transaction`
command (#8810)
---
dev-support/byteman/fail-create-bucket.btm | 28 +++++++++++++
.../compose/ozonesecure-ha/test-repair-tools.sh | 48 ++++++++++++++++++++++
.../repair/ratis-transaction-repair.robot | 35 ++++++++++++++++
3 files changed, 111 insertions(+)
diff --git a/dev-support/byteman/fail-create-bucket.btm b/dev-support/byteman/fail-create-bucket.btm
new file mode 100644
index 00000000000..f624e45717c
--- /dev/null
+++ b/dev-support/byteman/fail-create-bucket.btm
@@ -0,0 +1,28 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+#
+# This script instruments ozone manager to fail a CreateBucket request for a specific name
+#
+
+RULE Crash OM with CreateBucket
+CLASS org.apache.hadoop.ozone.om.request.bucket.OMBucketCreateRequest
+METHOD validateAndUpdateCache
+AT ENTRY
+IF TRUE
+DO
+ traceln("--> crashing CreateBucket request");
+ THROW new RuntimeException("Byteman crashes OM");
+ENDRULE
\ No newline at end of file
diff --git a/hadoop-ozone/dist/src/main/compose/ozonesecure-ha/test-repair-tools.sh b/hadoop-ozone/dist/src/main/compose/ozonesecure-ha/test-repair-tools.sh
index f181f5c6570..da54e913deb 100644
--- a/hadoop-ozone/dist/src/main/compose/ozonesecure-ha/test-repair-tools.sh
+++ b/hadoop-ozone/dist/src/main/compose/ozonesecure-ha/test-repair-tools.sh
@@ -40,6 +40,54 @@ create_data_dirs dn{1..5} kms om{1..3} recon s3g scm{1..3}
start_docker_env
+repair_and_restart_om() {
+ local om_container="$1"
+ local om_id="$2"
+ echo "Waiting for container '${om_container}' to stop..."
+ # Loop until the container is not running
+ timeout=60 # seconds
+ start_time=$(date +%s)
+ while [ "$(docker inspect -f '{{.State.Running}}' "${om_container}" 2>/dev/null)" == "true" ]; do
+ current_time=$(date +%s)
+ elapsed=$((current_time - start_time))
+
+ if [ "$elapsed" -ge "$timeout" ]; then
+ echo "Timeout: Container '${om_container}' did not stop within ${timeout} seconds."
+ exit 1
+ fi
+ sleep 1
+ done
+ echo "Container '${om_container}' has stopped."
+
+ logpath=$(execute_command_in_container ${SCM} bash -c "find / -type f -path '/*/$om_id/*/log_inprogress_0' 2>/dev/null | head -n 1")
+ echo "Ratis log segment file path: ${logpath}"
+
+ execute_command_in_container ${SCM} bash -c "ozone repair om srt -b=/opt/hadoop/compose/ozonesecure-ha/data/$om_id/backup1 --index=2 -s=${logpath}"
+ echo "Repair command executed for ${om_id}."
+ docker start "${om_container}"
+ echo "Container '${om_container}' started again."
+ bucketTable=$(execute_command_in_container ${SCM} bash -c "ozone debug ldb --db=/opt/hadoop/compose/ozonesecure-ha/data/$om_id/metadata/om.db scan --cf=bucketTable")
+ echo "Bucket table for ${om_id}:"
+ if echo "$bucketTable" | grep -q "bucket-crash-1"; then
+ echo "bucket 'bucket-crash-1' should not have been created, but it is present in the bucketTable of $om_id"
+ exit 1
+ else
+ echo "bucket 'bucket-crash-1' is not present in the bucketTable of $om_id as expected."
+ fi
+}
+
+echo "Testing ratis transaction repair on all OMs"
+execute_robot_test ${SCM} kinit.robot
+execute_robot_test ${SCM} repair/ratis-transaction-repair.robot
+repair_and_restart_om "ozonesecure-ha-om1-1" "om1"
+repair_and_restart_om "ozonesecure-ha-om2-1" "om2"
+repair_and_restart_om "ozonesecure-ha-om3-1" "om3"
+if ! execute_command_in_container scm1.org timeout 15s ozone sh volume list 1>/dev/null; then
+ echo "Command timed out or failed => OMs are not running as expected. Test for repairing ratis transaction failed."
+ exit 1
+fi
+echo "Testing ratis transaction repair completed successfully."
+
execute_robot_test ${OM} kinit.robot
echo "Creating test keys to verify om compaction"
diff --git a/hadoop-ozone/dist/src/main/smoketest/repair/ratis-transaction-repair.robot b/hadoop-ozone/dist/src/main/smoketest/repair/ratis-transaction-repair.robot
new file mode 100644
index 00000000000..e3a8d7cd997
--- /dev/null
+++ b/hadoop-ozone/dist/src/main/smoketest/repair/ratis-transaction-repair.robot
@@ -0,0 +1,35 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+*** Settings ***
+Documentation Test recovering from OM crash due to transaction failure
+Library OperatingSystem
+Library BuiltIn
+Library Process
+Resource ../lib/os.robot
+Resource ../ozone-fi/BytemanKeywords.robot
+
+*** Variables ***
+${VOLUME} test-txn-vol
+${BAD_BUCKET} bucket-crash-1
+${CRASH_RULE} /opt/hadoop/share/ozone/byteman/fail-create-bucket.btm
+${TIMEOUT} 10 seconds
+
+*** Test Cases ***
+Verify OM crash at bucket create
+ Inject Fault Into OMs Only ${CRASH_RULE}
+ Execute ozone sh volume create o3://${OM_SERVICE_ID}/${VOLUME}
+ Run Process ozone sh bucket create o3://${OM_SERVICE_ID}/${VOLUME}/${BAD_BUCKET} timeout=${TIMEOUT} shell=True
+ Remove Fault From OMs Only ${CRASH_RULE}
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]