[ https://issues.apache.org/jira/browse/HBASE-28932?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ]
Rushabh Shah updated HBASE-28932: --------------------------------- Description: RS kept on running even if it was unable to write replication marker for 5 minutes. But this issue is not specific to just replication marker. It applies to compaction marker as well as region event marker (like open, close). Sample exception trace: {noformat} 2024-10-09 10:12:21,659 ERROR [regionserver/regionserver-33:60020.Chore.3] regionserver.ReplicationMarkerChore - Exception while sync'ing replication tracker edit org.apache.hadoop.hbase.exceptions.TimeoutIOException: Failed to get sync result after 300000 ms for txid=15030132, WAL system stuck? at org.apache.hadoop.hbase.regionserver.wal.SyncFuture.get(SyncFuture.java:171) at org.apache.hadoop.hbase.regionserver.wal.AbstractFSWAL.blockOnSync(AbstractFSWAL.java:876) at org.apache.hadoop.hbase.regionserver.wal.FSHLog.publishSyncThenBlockOnCompletion(FSHLog.java:802) at org.apache.hadoop.hbase.regionserver.wal.FSHLog.doSync(FSHLog.java:836) at org.apache.hadoop.hbase.regionserver.wal.AbstractFSWAL.lambda$sync$3(AbstractFSWAL.java:602) at org.apache.hadoop.hbase.trace.TraceUtil.trace(TraceUtil.java:187) at org.apache.hadoop.hbase.regionserver.wal.AbstractFSWAL.sync(AbstractFSWAL.java:602) at org.apache.hadoop.hbase.regionserver.wal.AbstractFSWAL.sync(AbstractFSWAL.java:592) at org.apache.hadoop.hbase.regionserver.wal.WALUtil.doFullMarkerAppendTransaction(WALUtil.java:169) at org.apache.hadoop.hbase.regionserver.wal.WALUtil.writeMarker(WALUtil.java:146) at org.apache.hadoop.hbase.regionserver.wal.WALUtil.writeReplicationMarkerAndSync(WALUtil.java:230) at org.apache.hadoop.hbase.replication.regionserver.ReplicationMarkerChore.chore(ReplicationMarkerChore.java:99) at org.apache.hadoop.hbase.ScheduledChore.run(ScheduledChore.java:161) at java.util.concurrent.Executors$RunnableAdapter.call(Executors.java:511) at java.util.concurrent.FutureTask.runAndReset(FutureTask.java:308) at 
java.util.concurrent.ScheduledThreadPoolExecutor$ScheduledFutureTask.access$301(ScheduledThreadPoolExecutor.java:180) at java.util.concurrent.ScheduledThreadPoolExecutor$ScheduledFutureTask.run(ScheduledThreadPoolExecutor.java:294) at org.apache.hadoop.hbase.JitterScheduledThreadPoolExecutorImpl$JitteredRunnableScheduledFuture.run(JitterScheduledThreadPoolExecutorImpl.java:107) at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149) at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624) at java.lang.Thread.run(Thread.java:750) {noformat} was: RS kept on running even if it was unable to write replication marker. But this issue is not specific to just replication marker. It applies to compaction marker as well as region event marker (like open, close). Sample exception trace: {noformat} 2024-10-09 10:12:21,659 ERROR [regionserver/regionserver-33:60020.Chore.3] regionserver.ReplicationMarkerChore - Exception whil e sync'ing replication tracker edit org.apache.hadoop.hbase.exceptions.TimeoutIOException: Failed to get sync result after 300000 ms for txid=15030132, WAL system stuck? 
at org.apache.hadoop.hbase.regionserver.wal.SyncFuture.get(SyncFuture.java:171) at org.apache.hadoop.hbase.regionserver.wal.AbstractFSWAL.blockOnSync(AbstractFSWAL.java:876) at org.apache.hadoop.hbase.regionserver.wal.FSHLog.publishSyncThenBlockOnCompletion(FSHLog.java:802) at org.apache.hadoop.hbase.regionserver.wal.FSHLog.doSync(FSHLog.java:836) at org.apache.hadoop.hbase.regionserver.wal.AbstractFSWAL.lambda$sync$3(AbstractFSWAL.java:602) at org.apache.hadoop.hbase.trace.TraceUtil.trace(TraceUtil.java:187) at org.apache.hadoop.hbase.regionserver.wal.AbstractFSWAL.sync(AbstractFSWAL.java:602) at org.apache.hadoop.hbase.regionserver.wal.AbstractFSWAL.sync(AbstractFSWAL.java:592) at org.apache.hadoop.hbase.regionserver.wal.WALUtil.doFullMarkerAppendTransaction(WALUtil.java:169) at org.apache.hadoop.hbase.regionserver.wal.WALUtil.writeMarker(WALUtil.java:146) at org.apache.hadoop.hbase.regionserver.wal.WALUtil.writeReplicationMarkerAndSync(WALUtil.java:230) at org.apache.hadoop.hbase.replication.regionserver.ReplicationMarkerChore.chore(ReplicationMarkerChore.java:99) at org.apache.hadoop.hbase.ScheduledChore.run(ScheduledChore.java:161) at java.util.concurrent.Executors$RunnableAdapter.call(Executors.java:511) at java.util.concurrent.FutureTask.runAndReset(FutureTask.java:308) at java.util.concurrent.ScheduledThreadPoolExecutor$ScheduledFutureTask.access$301(ScheduledThreadPoolExecutor.java:180) at java.util.concurrent.ScheduledThreadPoolExecutor$ScheduledFutureTask.run(ScheduledThreadPoolExecutor.java:294) at org.apache.hadoop.hbase.JitterScheduledThreadPoolExecutorImpl$JitteredRunnableScheduledFuture.run(JitterScheduledThreadPoolExecutorImpl.java:107) at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149) at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624) at java.lang.Thread.run(Thread.java:750) {noformat} > Abort RS if unable to sync internal markers. 
> -------------------------------------------- > > Key: HBASE-28932 > URL: https://issues.apache.org/jira/browse/HBASE-28932 > Project: HBase > Issue Type: Bug > Components: wal > Affects Versions: 2.5.8 > Reporter: Rushabh Shah > Priority: Major > > RS kept on running even if it was unable to write replication marker for 5 > minutes. But this issue is not specific to just replication marker. It > applies to compaction marker as well as region event marker (like open, > close). > Sample exception trace: > {noformat} > 2024-10-09 10:12:21,659 ERROR [regionserver/regionserver-33:60020.Chore.3] > regionserver.ReplicationMarkerChore - Exception while > sync'ing replication tracker edit > org.apache.hadoop.hbase.exceptions.TimeoutIOException: Failed to get sync > result after 300000 ms for txid=15030132, WAL system stuck? > at > org.apache.hadoop.hbase.regionserver.wal.SyncFuture.get(SyncFuture.java:171) > at > org.apache.hadoop.hbase.regionserver.wal.AbstractFSWAL.blockOnSync(AbstractFSWAL.java:876) > at > org.apache.hadoop.hbase.regionserver.wal.FSHLog.publishSyncThenBlockOnCompletion(FSHLog.java:802) > at > org.apache.hadoop.hbase.regionserver.wal.FSHLog.doSync(FSHLog.java:836) > at > org.apache.hadoop.hbase.regionserver.wal.AbstractFSWAL.lambda$sync$3(AbstractFSWAL.java:602) > at org.apache.hadoop.hbase.trace.TraceUtil.trace(TraceUtil.java:187) > at > org.apache.hadoop.hbase.regionserver.wal.AbstractFSWAL.sync(AbstractFSWAL.java:602) > at > org.apache.hadoop.hbase.regionserver.wal.AbstractFSWAL.sync(AbstractFSWAL.java:592) > at > org.apache.hadoop.hbase.regionserver.wal.WALUtil.doFullMarkerAppendTransaction(WALUtil.java:169) > at > org.apache.hadoop.hbase.regionserver.wal.WALUtil.writeMarker(WALUtil.java:146) > at > org.apache.hadoop.hbase.regionserver.wal.WALUtil.writeReplicationMarkerAndSync(WALUtil.java:230) > at > org.apache.hadoop.hbase.replication.regionserver.ReplicationMarkerChore.chore(ReplicationMarkerChore.java:99) > at 
org.apache.hadoop.hbase.ScheduledChore.run(ScheduledChore.java:161) > at > java.util.concurrent.Executors$RunnableAdapter.call(Executors.java:511) > at java.util.concurrent.FutureTask.runAndReset(FutureTask.java:308) > at > java.util.concurrent.ScheduledThreadPoolExecutor$ScheduledFutureTask.access$301(ScheduledThreadPoolExecutor.java:180) > at > java.util.concurrent.ScheduledThreadPoolExecutor$ScheduledFutureTask.run(ScheduledThreadPoolExecutor.java:294) > at > org.apache.hadoop.hbase.JitterScheduledThreadPoolExecutorImpl$JitteredRunnableScheduledFuture.run(JitterScheduledThreadPoolExecutorImpl.java:107) > at > java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149) > at > java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624) > at java.lang.Thread.run(Thread.java:750) > {noformat} -- This message was sent by Atlassian Jira (v8.20.10#820010)