[ https://issues.apache.org/jira/browse/HBASE-28803?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ]
Nick Dimiduk updated HBASE-28803: --------------------------------- Component/s: master > HBase Master stuck due to improper handling of WALSyncTimeoutException within > UncheckedIOException > -------------------------------------------------------------------------------------------------- > > Key: HBASE-28803 > URL: https://issues.apache.org/jira/browse/HBASE-28803 > Project: HBase > Issue Type: Bug > Components: master, wal > Affects Versions: 2.6.0, 3.0.0-alpha-4 > Reporter: Peter Somogyi > Assignee: Peter Somogyi > Priority: Critical > Labels: pull-request-available > > One of our test clusters got stuck during a rolling restart due to a WAL.sync > timeout. This issue did not result in the Master aborting because the > WALSyncTimeoutIOException was wrapped in an UncheckedIOException, which > prevented the proper exception handling mechanism from being triggered. As a > result, the Master was hanging for a long time and procedures were stuck. > This was a 2.4-based HBase with HBASE-27230. > {noformat} > 2024-08-17 17:23:07,567 ERROR > org.apache.hadoop.hbase.procedure2.store.region.RegionProcedureStore: Failed > to delete pid=2027 > org.apache.hadoop.hbase.regionserver.wal.WALSyncTimeoutIOException: > org.apache.hadoop.hbase.exceptions.TimeoutIOException: Failed to get sync > result after 300000 ms for txid=4347, WAL system stuck? 
> at > org.apache.hadoop.hbase.regionserver.wal.AbstractFSWAL.blockOnSync(AbstractFSWAL.java:848) > at > org.apache.hadoop.hbase.regionserver.wal.AsyncFSWAL.sync(AsyncFSWAL.java:718) > at org.apache.hadoop.hbase.regionserver.HRegion.sync(HRegion.java:8902) > at > org.apache.hadoop.hbase.regionserver.HRegion.doWALAppend(HRegion.java:8469) > at > org.apache.hadoop.hbase.regionserver.HRegion.doMiniBatchMutate(HRegion.java:4523) > at > org.apache.hadoop.hbase.regionserver.HRegion.batchMutate(HRegion.java:4447) > at > org.apache.hadoop.hbase.regionserver.HRegion.batchMutate(HRegion.java:4377) > at > org.apache.hadoop.hbase.regionserver.HRegion.doBatchMutate(HRegion.java:4853) > at > org.apache.hadoop.hbase.regionserver.HRegion.doBatchMutate(HRegion.java:4847) > at > org.apache.hadoop.hbase.regionserver.HRegion.doBatchMutate(HRegion.java:4843) > at org.apache.hadoop.hbase.regionserver.HRegion.put(HRegion.java:3155) > at > org.apache.hadoop.hbase.procedure2.store.region.RegionProcedureStore.lambda$delete$8(RegionProcedureStore.java:379) > at > org.apache.hadoop.hbase.master.region.MasterRegion.update(MasterRegion.java:141) > at > org.apache.hadoop.hbase.procedure2.store.region.RegionProcedureStore.delete(RegionProcedureStore.java:379) > at > org.apache.hadoop.hbase.procedure2.store.region.RegionProcedureStore.delete(RegionProcedureStore.java:410) > at > org.apache.hadoop.hbase.procedure2.CompletedProcedureCleaner.periodicExecute(CompletedProcedureCleaner.java:135) > at > org.apache.hadoop.hbase.procedure2.TimeoutExecutorThread.executeInMemoryChore(TimeoutExecutorThread.java:122) > at > org.apache.hadoop.hbase.procedure2.TimeoutExecutorThread.execDelayedProcedure(TimeoutExecutorThread.java:101) > at > org.apache.hadoop.hbase.procedure2.TimeoutExecutorThread.run(TimeoutExecutorThread.java:68) > Caused by: org.apache.hadoop.hbase.exceptions.TimeoutIOException: Failed to > get sync result after 300000 ms for txid=4347, WAL system stuck? 
> at > org.apache.hadoop.hbase.regionserver.wal.SyncFuture.get(SyncFuture.java:171) > at > org.apache.hadoop.hbase.regionserver.wal.AbstractFSWAL.blockOnSync(AbstractFSWAL.java:844) > ... 18 more > 2024-08-17 17:23:07,568 ERROR > org.apache.hadoop.hbase.procedure2.TimeoutExecutorThread: Ignoring pid=-1, > state=WAITING_TIMEOUT; > org.apache.hadoop.hbase.procedure2.CompletedProcedureCleaner exception: > org.apache.hadoop.hbase.regionserver.wal.WALSyncTimeoutIOException: > org.apache.hadoop.hbase.exceptions.TimeoutIOException: Failed to get sync > result after 300000 ms for txid=4347, WAL system stuck? > java.io.UncheckedIOException: > org.apache.hadoop.hbase.regionserver.wal.WALSyncTimeoutIOException: > org.apache.hadoop.hbase.exceptions.TimeoutIOException: Failed to get sync > result after 300000 ms for txid=4347, WAL system stuck? > at > org.apache.hadoop.hbase.procedure2.store.region.RegionProcedureStore.delete(RegionProcedureStore.java:383) > at > org.apache.hadoop.hbase.procedure2.store.region.RegionProcedureStore.delete(RegionProcedureStore.java:410) > at > org.apache.hadoop.hbase.procedure2.CompletedProcedureCleaner.periodicExecute(CompletedProcedureCleaner.java:135) > at > org.apache.hadoop.hbase.procedure2.TimeoutExecutorThread.executeInMemoryChore(TimeoutExecutorThread.java:122) > at > org.apache.hadoop.hbase.procedure2.TimeoutExecutorThread.execDelayedProcedure(TimeoutExecutorThread.java:101) > at > org.apache.hadoop.hbase.procedure2.TimeoutExecutorThread.run(TimeoutExecutorThread.java:68) > Caused by: > org.apache.hadoop.hbase.regionserver.wal.WALSyncTimeoutIOException: > org.apache.hadoop.hbase.exceptions.TimeoutIOException: Failed to get sync > result after 300000 ms for txid=4347, WAL system stuck? 
> at > org.apache.hadoop.hbase.regionserver.wal.AbstractFSWAL.blockOnSync(AbstractFSWAL.java:848) > at > org.apache.hadoop.hbase.regionserver.wal.AsyncFSWAL.sync(AsyncFSWAL.java:718) > at org.apache.hadoop.hbase.regionserver.HRegion.sync(HRegion.java:8902) > at > org.apache.hadoop.hbase.regionserver.HRegion.doWALAppend(HRegion.java:8469) > at > org.apache.hadoop.hbase.regionserver.HRegion.doMiniBatchMutate(HRegion.java:4523) > at > org.apache.hadoop.hbase.regionserver.HRegion.batchMutate(HRegion.java:4447) > at > org.apache.hadoop.hbase.regionserver.HRegion.batchMutate(HRegion.java:4377) > at > org.apache.hadoop.hbase.regionserver.HRegion.doBatchMutate(HRegion.java:4853) > at > org.apache.hadoop.hbase.regionserver.HRegion.doBatchMutate(HRegion.java:4847) > at > org.apache.hadoop.hbase.regionserver.HRegion.doBatchMutate(HRegion.java:4843) > at org.apache.hadoop.hbase.regionserver.HRegion.put(HRegion.java:3155) > at > org.apache.hadoop.hbase.procedure2.store.region.RegionProcedureStore.lambda$delete$8(RegionProcedureStore.java:379) > at > org.apache.hadoop.hbase.master.region.MasterRegion.update(MasterRegion.java:141) > at > org.apache.hadoop.hbase.procedure2.store.region.RegionProcedureStore.delete(RegionProcedureStore.java:379) > ... 5 more > Caused by: org.apache.hadoop.hbase.exceptions.TimeoutIOException: Failed to > get sync result after 300000 ms for txid=4347, WAL system stuck? > at > org.apache.hadoop.hbase.regionserver.wal.SyncFuture.get(SyncFuture.java:171) > at > org.apache.hadoop.hbase.regionserver.wal.AbstractFSWAL.blockOnSync(AbstractFSWAL.java:844) > ... 18 more > 2024-08-17 17:23:07,569 WARN > org.apache.hadoop.hbase.master.assignment.AssignmentManager: STUCK > Region-In-Transition state=OPEN, > location=host-10.example.com,22101,1723906425777, table=OMID_COMMIT_TABLE, > region=1b8c62897ed9e90955e299bfca1e7aa9{noformat} -- This message was sent by Atlassian Jira (v8.20.10#820010)