[ https://issues.apache.org/jira/browse/HBASE-28912?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ]
Ke Han updated HBASE-28912: --------------------------- Description: I ran into an NPE exception when starting up the HBase 2.6.0 cluster This bug is non-deterministic. It happens rarely and likely requires certain concurrency control to trigger. h1. Description When starting up 2.6.0 HBase cluster: 1HM, 1RS and a HDFS cluster, some network delay between RS and HDFS cluster could lead to RS aborting with NPE. {code:java} 2024-09-28T07:59:33,183 WARN [RS_OPEN_META-regionserver/hregion1:16020-0] handler.AssignRegionHandler: Failed to open region hbase:meta,,1.1588230740, will report to master org.apache.hadoop.hbase.FailedCloseWALAfterInitializedErrorException: Failed close after init wal failed. at org.apache.hadoop.hbase.wal.AbstractFSWALProvider.getWAL(AbstractFSWALProvider.java:164) ~[hbase-server-2.6.0.jar:2.6.0] at org.apache.hadoop.hbase.wal.AbstractFSWALProvider.getWAL(AbstractFSWALProvider.java:59) ~[hbase-server-2.6.0.jar:2.6.0] at org.apache.hadoop.hbase.wal.WALFactory.getWAL(WALFactory.java:323) ~[hbase-server-2.6.0.jar:2.6.0] at org.apache.hadoop.hbase.regionserver.HRegionServer.getWAL(HRegionServer.java:2513) ~[hbase-server-2.6.0.jar:2.6.0] at org.apache.hadoop.hbase.regionserver.handler.AssignRegionHandler.process(AssignRegionHandler.java:149) ~[hbase-server-2.6.0.jar:2.6.0] at org.apache.hadoop.hbase.executor.EventHandler.run(EventHandler.java:104) ~[hbase-server-2.6.0.jar:2.6.0] at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149) ~[?:1.8.0_362] at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624) ~[?:1.8.0_362] at java.lang.Thread.run(Thread.java:750) ~[?:1.8.0_362] Caused by: java.io.IOException: java.lang.NullPointerException at org.apache.hadoop.hbase.regionserver.wal.AbstractFSWAL.shutdown(AbstractFSWAL.java:1020) ~[hbase-server-2.6.0.jar:2.6.0] at org.apache.hadoop.hbase.regionserver.wal.AbstractFSWAL.close(AbstractFSWAL.java:1046) ~[hbase-server-2.6.0.jar:2.6.0] at org.apache.hadoop.hbase.wal.AbstractFSWALProvider.getWAL(AbstractFSWALProvider.java:162) ~[hbase-server-2.6.0.jar:2.6.0] ... 8 more Caused by: java.lang.NullPointerException at java.util.concurrent.ConcurrentHashMap.putVal(ConcurrentHashMap.java:1011) ~[?:1.8.0_362] at java.util.concurrent.ConcurrentHashMap.put(ConcurrentHashMap.java:1006) ~[?:1.8.0_362] at org.apache.hadoop.hbase.regionserver.wal.AsyncFSWAL.closeWriter(AsyncFSWAL.java:718) ~[hbase-server-2.6.0.jar:2.6.0] at org.apache.hadoop.hbase.regionserver.wal.AsyncFSWAL.doShutdown(AsyncFSWAL.java:775) ~[hbase-server-2.6.0.jar:2.6.0] at org.apache.hadoop.hbase.regionserver.wal.AbstractFSWAL$2.call(AbstractFSWAL.java:992) ~[hbase-server-2.6.0.jar:2.6.0] at org.apache.hadoop.hbase.regionserver.wal.AbstractFSWAL$2.call(AbstractFSWAL.java:987) ~[hbase-server-2.6.0.jar:2.6.0] at java.util.concurrent.FutureTask.run(FutureTask.java:266) ~[?:1.8.0_362] ... 3 more {code} It cannot open hbase:meta region correctly. {*}The WAL is in a broken state{*}. I tried to start up the HBase in another version (trunk: fdd8fa5) after the network delay between HBase and HDFS was recovered. However, the HMaster would also abort with the following exception {code:java} 2024-09-28T08:57:22,319 ERROR [master/hmaster:16000:becomeActiveMaster] master.HMaster: ***** ABORTING master hmaster,16000,1727513781417: Unhandled exception. Starting shutdown. ***** java.lang.IllegalStateException: Expected the service ClusterSchemaServiceImpl [FAILED] to be RUNNING, but the service has FAILED at org.apache.hbase.thirdparty.com.google.common.util.concurrent.AbstractService.checkCurrentState(AbstractService.java:384) ~[hbase-shaded-miscellaneous-4.1.8.jar:4.1.8] at org.apache.hbase.thirdparty.com.google.common.util.concurrent.AbstractService.awaitRunning(AbstractService.java:324) ~[hbase-shaded-miscellaneous-4.1.8.jar:4.1.8] at org.apache.hadoop.hbase.master.HMaster.initClusterSchemaService(HMaster.java:1565) ~[hbase-server-4.0.0-alpha-1-SNAPSHOT.jar:4.0.0-alpha-1-SNAPSHOT] at org.apache.hadoop.hbase.master.HMaster.finishActiveMasterInitialization(HMaster.java:1218) ~[hbase-server-4.0.0-alpha-1-SNAPSHOT.jar:4.0.0-alpha-1-SNAPSHOT] at org.apache.hadoop.hbase.master.HMaster.startActiveMasterManager(HMaster.java:2535) ~[hbase-server-4.0.0-alpha-1-SNAPSHOT.jar:4.0.0-alpha-1-SNAPSHOT] at org.apache.hadoop.hbase.master.HMaster.lambda$run$0(HMaster.java:613) ~[hbase-server-4.0.0-alpha-1-SNAPSHOT.jar:4.0.0-alpha-1-SNAPSHOT] at org.apache.hadoop.hbase.trace.TraceUtil.lambda$tracedRunnable$2(TraceUtil.java:155) ~[hbase-common-4.0.0-alpha-1-SNAPSHOT.jar:4.0.0-alpha-1-SNAPSHOT] at java.lang.Thread.run(Thread.java:833) ~[?:?] Caused by: org.apache.hadoop.hbase.TableNotFoundException: hbase:namespace at java.lang.Thread.getStackTrace(Thread.java:1610) ~[?:?] at org.apache.hadoop.hbase.util.FutureUtils.setStackTrace(FutureUtils.java:144) ~[hbase-common-4.0.0-alpha-1-SNAPSHOT.jar:4.0.0-alpha-1-SNAPSHOT] at org.apache.hadoop.hbase.util.FutureUtils.rethrow(FutureUtils.java:163) ~[hbase-common-4.0.0-alpha-1-SNAPSHOT.jar:4.0.0-alpha-1-SNAPSHOT] at org.apache.hadoop.hbase.client.AsyncTableResultScanner.next(AsyncTableResultScanner.java:173) ~[hbase-client-4.0.0-alpha-1-SNAPSHOT.jar:4.0.0-alpha-1-SNAPSHOT] at org.apache.hadoop.hbase.master.TableNamespaceManager.loadFromNamespace(TableNamespaceManager.java:126) ~[hbase-server-4.0.0-alpha-1-SNAPSHOT.jar:4.0.0-alpha-1-SNAPSHOT] at org.apache.hadoop.hbase.master.TableNamespaceManager.loadNamespaceIntoCache(TableNamespaceManager.java:168) ~[hbase-server-4.0.0-alpha-1-SNAPSHOT.jar:4.0.0-alpha-1-SNAPSHOT] at org.apache.hadoop.hbase.master.TableNamespaceManager.start(TableNamespaceManager.java:175) ~[hbase-server-4.0.0-alpha-1-SNAPSHOT.jar:4.0.0-alpha-1-SNAPSHOT] at org.apache.hadoop.hbase.master.ClusterSchemaServiceImpl.doStart(ClusterSchemaServiceImpl.java:61) ~[hbase-server-4.0.0-alpha-1-SNAPSHOT.jar:4.0.0-alpha-1-SNAPSHOT] at org.apache.hbase.thirdparty.com.google.common.util.concurrent.AbstractService.startAsync(AbstractService.java:252) ~[hbase-shaded-miscellaneous-4.1.8.jar:4.1.8] at org.apache.hadoop.hbase.master.HMaster.initClusterSchemaService(HMaster.java:1562) ~[hbase-server-4.0.0-alpha-1-SNAPSHOT.jar:4.0.0-alpha-1-SNAPSHOT] at org.apache.hadoop.hbase.master.HMaster.finishActiveMasterInitialization(HMaster.java:1218) ~[hbase-server-4.0.0-alpha-1-SNAPSHOT.jar:4.0.0-alpha-1-SNAPSHOT] at org.apache.hadoop.hbase.master.HMaster.startActiveMasterManager(HMaster.java:2535) ~[hbase-server-4.0.0-alpha-1-SNAPSHOT.jar:4.0.0-alpha-1-SNAPSHOT] at org.apache.hadoop.hbase.master.HMaster.lambda$run$0(HMaster.java:613) ~[hbase-server-4.0.0-alpha-1-SNAPSHOT.jar:4.0.0-alpha-1-SNAPSHOT] at org.apache.hadoop.hbase.trace.TraceUtil.lambda$tracedRunnable$2(TraceUtil.java:155) ~[hbase-common-4.0.0-alpha-1-SNAPSHOT.jar:4.0.0-alpha-1-SNAPSHOT] at java.lang.Thread.run(Thread.java:833) ~[?:?] {code} I need to manually remove all files stored in HDFS and do a fresh restart of HBase to resolve this problem. I attached my HM and RS logs to this ticket. I think a better way could be to avoid writing broken WAL to disk, which likely would block the restart/upgrade. In that case, users do not need to clear the entire disk state manually. was: I ran into an NPE exception when starting up the HBase 2.6.0 cluster This bug is non-deterministic. It happens rarely and likely requires certain concurrency control to trigger. h1. Description When starting up 2.6.0 HBase cluster: 1HM, 1RS and a HDFS cluster, some network delay between RS and HDFS cluster could lead to RS aborting with NPE. {code:java} 2024-09-28T07:59:33,183 WARN [RS_OPEN_META-regionserver/hregion1:16020-0] handler.AssignRegionHandler: Failed to open region hbase:meta,,1.1588230740, will report to master org.apache.hadoop.hbase.FailedCloseWALAfterInitializedErrorException: Failed close after init wal failed. at org.apache.hadoop.hbase.wal.AbstractFSWALProvider.getWAL(AbstractFSWALProvider.java:164) ~[hbase-server-2.6.0.jar:2.6.0] at org.apache.hadoop.hbase.wal.AbstractFSWALProvider.getWAL(AbstractFSWALProvider.java:59) ~[hbase-server-2.6.0.jar:2.6.0] at org.apache.hadoop.hbase.wal.WALFactory.getWAL(WALFactory.java:323) ~[hbase-server-2.6.0.jar:2.6.0] at org.apache.hadoop.hbase.regionserver.HRegionServer.getWAL(HRegionServer.java:2513) ~[hbase-server-2.6.0.jar:2.6.0] at org.apache.hadoop.hbase.regionserver.handler.AssignRegionHandler.process(AssignRegionHandler.java:149) ~[hbase-server-2.6.0.jar:2.6.0] at org.apache.hadoop.hbase.executor.EventHandler.run(EventHandler.java:104) ~[hbase-server-2.6.0.jar:2.6.0] at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149) ~[?:1.8.0_362] at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624) ~[?:1.8.0_362] at java.lang.Thread.run(Thread.java:750) ~[?:1.8.0_362] Caused by: java.io.IOException: java.lang.NullPointerException at org.apache.hadoop.hbase.regionserver.wal.AbstractFSWAL.shutdown(AbstractFSWAL.java:1020) ~[hbase-server-2.6.0.jar:2.6.0] at org.apache.hadoop.hbase.regionserver.wal.AbstractFSWAL.close(AbstractFSWAL.java:1046) ~[hbase-server-2.6.0.jar:2.6.0] at org.apache.hadoop.hbase.wal.AbstractFSWALProvider.getWAL(AbstractFSWALProvider.java:162) ~[hbase-server-2.6.0.jar:2.6.0] ... 8 more Caused by: java.lang.NullPointerException at java.util.concurrent.ConcurrentHashMap.putVal(ConcurrentHashMap.java:1011) ~[?:1.8.0_362] at java.util.concurrent.ConcurrentHashMap.put(ConcurrentHashMap.java:1006) ~[?:1.8.0_362] at org.apache.hadoop.hbase.regionserver.wal.AsyncFSWAL.closeWriter(AsyncFSWAL.java:718) ~[hbase-server-2.6.0.jar:2.6.0] at org.apache.hadoop.hbase.regionserver.wal.AsyncFSWAL.doShutdown(AsyncFSWAL.java:775) ~[hbase-server-2.6.0.jar:2.6.0] at org.apache.hadoop.hbase.regionserver.wal.AbstractFSWAL$2.call(AbstractFSWAL.java:992) ~[hbase-server-2.6.0.jar:2.6.0] at org.apache.hadoop.hbase.regionserver.wal.AbstractFSWAL$2.call(AbstractFSWAL.java:987) ~[hbase-server-2.6.0.jar:2.6.0] at java.util.concurrent.FutureTask.run(FutureTask.java:266) ~[?:1.8.0_362] ... 3 more {code} It cannot open hbase:meta region correctly. {*}The WAL is in a broken state{*}. I tried to start up the HBase in another version (trunk: fdd8fa5) after the network delay between HBase and HDFS was recovered. However, the HMaster would also abort with the following exception {code:java} 2024-09-28T08:57:22,319 ERROR [master/hmaster:16000:becomeActiveMaster] master.HMaster: ***** ABORTING master hmaster,16000,1727513781417: Unhandled exception. Starting shutdown. ***** java.lang.IllegalStateException: Expected the service ClusterSchemaServiceImpl [FAILED] to be RUNNING, but the service has FAILED at org.apache.hbase.thirdparty.com.google.common.util.concurrent.AbstractService.checkCurrentState(AbstractService.java:384) ~[hbase-shaded-miscellaneous-4.1.8.jar:4.1.8] at org.apache.hbase.thirdparty.com.google.common.util.concurrent.AbstractService.awaitRunning(AbstractService.java:324) ~[hbase-shaded-miscellaneous-4.1.8.jar:4.1.8] at org.apache.hadoop.hbase.master.HMaster.initClusterSchemaService(HMaster.java:1565) ~[hbase-server-4.0.0-alpha-1-SNAPSHOT.jar:4.0.0-alpha-1-SNAPSHOT] at org.apache.hadoop.hbase.master.HMaster.finishActiveMasterInitialization(HMaster.java:1218) ~[hbase-server-4.0.0-alpha-1-SNAPSHOT.jar:4.0.0-alpha-1-SNAPSHOT] at org.apache.hadoop.hbase.master.HMaster.startActiveMasterManager(HMaster.java:2535) ~[hbase-server-4.0.0-alpha-1-SNAPSHOT.jar:4.0.0-alpha-1-SNAPSHOT] at org.apache.hadoop.hbase.master.HMaster.lambda$run$0(HMaster.java:613) ~[hbase-server-4.0.0-alpha-1-SNAPSHOT.jar:4.0.0-alpha-1-SNAPSHOT] at org.apache.hadoop.hbase.trace.TraceUtil.lambda$tracedRunnable$2(TraceUtil.java:155) ~[hbase-common-4.0.0-alpha-1-SNAPSHOT.jar:4.0.0-alpha-1-SNAPSHOT] at java.lang.Thread.run(Thread.java:833) ~[?:?] Caused by: org.apache.hadoop.hbase.TableNotFoundException: hbase:namespace at java.lang.Thread.getStackTrace(Thread.java:1610) ~[?:?] at org.apache.hadoop.hbase.util.FutureUtils.setStackTrace(FutureUtils.java:144) ~[hbase-common-4.0.0-alpha-1-SNAPSHOT.jar:4.0.0-alpha-1-SNAPSHOT] at org.apache.hadoop.hbase.util.FutureUtils.rethrow(FutureUtils.java:163) ~[hbase-common-4.0.0-alpha-1-SNAPSHOT.jar:4.0.0-alpha-1-SNAPSHOT] at org.apache.hadoop.hbase.client.AsyncTableResultScanner.next(AsyncTableResultScanner.java:173) ~[hbase-client-4.0.0-alpha-1-SNAPSHOT.jar:4.0.0-alpha-1-SNAPSHOT] at org.apache.hadoop.hbase.master.TableNamespaceManager.loadFromNamespace(TableNamespaceManager.java:126) ~[hbase-server-4.0.0-alpha-1-SNAPSHOT.jar:4.0.0-alpha-1-SNAPSHOT] at org.apache.hadoop.hbase.master.TableNamespaceManager.loadNamespaceIntoCache(TableNamespaceManager.java:168) ~[hbase-server-4.0.0-alpha-1-SNAPSHOT.jar:4.0.0-alpha-1-SNAPSHOT] at org.apache.hadoop.hbase.master.TableNamespaceManager.start(TableNamespaceManager.java:175) ~[hbase-server-4.0.0-alpha-1-SNAPSHOT.jar:4.0.0-alpha-1-SNAPSHOT] at org.apache.hadoop.hbase.master.ClusterSchemaServiceImpl.doStart(ClusterSchemaServiceImpl.java:61) ~[hbase-server-4.0.0-alpha-1-SNAPSHOT.jar:4.0.0-alpha-1-SNAPSHOT] at org.apache.hbase.thirdparty.com.google.common.util.concurrent.AbstractService.startAsync(AbstractService.java:252) ~[hbase-shaded-miscellaneous-4.1.8.jar:4.1.8] at org.apache.hadoop.hbase.master.HMaster.initClusterSchemaService(HMaster.java:1562) ~[hbase-server-4.0.0-alpha-1-SNAPSHOT.jar:4.0.0-alpha-1-SNAPSHOT] at org.apache.hadoop.hbase.master.HMaster.finishActiveMasterInitialization(HMaster.java:1218) ~[hbase-server-4.0.0-alpha-1-SNAPSHOT.jar:4.0.0-alpha-1-SNAPSHOT] at org.apache.hadoop.hbase.master.HMaster.startActiveMasterManager(HMaster.java:2535) ~[hbase-server-4.0.0-alpha-1-SNAPSHOT.jar:4.0.0-alpha-1-SNAPSHOT] at org.apache.hadoop.hbase.master.HMaster.lambda$run$0(HMaster.java:613) ~[hbase-server-4.0.0-alpha-1-SNAPSHOT.jar:4.0.0-alpha-1-SNAPSHOT] at org.apache.hadoop.hbase.trace.TraceUtil.lambda$tracedRunnable$2(TraceUtil.java:155) ~[hbase-common-4.0.0-alpha-1-SNAPSHOT.jar:4.0.0-alpha-1-SNAPSHOT] at java.lang.Thread.run(Thread.java:833) ~[?:?] {code} I need to manually remove all files stored in HDFS and do a fresh restart of HBase to resolve this problem. I think a better way could be to avoid writing broken WAL to disk, which likely would block the restart/upgrade. In that case, users do not need to clear the entire disk state manually. > NPE when starting up the HBase 2.6.0 cluster if there's a temporal network > delay with HDFS cluster > -------------------------------------------------------------------------------------------------- > > Key: HBASE-28912 > URL: https://issues.apache.org/jira/browse/HBASE-28912 > Project: HBase > Issue Type: Bug > Components: regionserver, wal > Affects Versions: 2.6.0 > Reporter: Ke Han > Priority: Major > Attachments: hbase--master-877f8202c310.log, > hbase--regionserver-03320f597c20.log > > > I ran into an NPE exception when starting up the HBase 2.6.0 cluster > This bug is non-deterministic. It happens rarely and likely requires certain > concurrency control to trigger. > h1. Description > When starting up 2.6.0 HBase cluster: 1HM, 1RS and a HDFS cluster, some > network delay between RS and HDFS cluster could lead to RS aborting with NPE. > {code:java} > 2024-09-28T07:59:33,183 WARN [RS_OPEN_META-regionserver/hregion1:16020-0] > handler.AssignRegionHandler: Failed to open region hbase:meta,,1.1588230740, > will report to master > org.apache.hadoop.hbase.FailedCloseWALAfterInitializedErrorException: Failed > close after init wal failed. > at > org.apache.hadoop.hbase.wal.AbstractFSWALProvider.getWAL(AbstractFSWALProvider.java:164) > ~[hbase-server-2.6.0.jar:2.6.0] > at > org.apache.hadoop.hbase.wal.AbstractFSWALProvider.getWAL(AbstractFSWALProvider.java:59) > ~[hbase-server-2.6.0.jar:2.6.0] > at org.apache.hadoop.hbase.wal.WALFactory.getWAL(WALFactory.java:323) > ~[hbase-server-2.6.0.jar:2.6.0] > at > org.apache.hadoop.hbase.regionserver.HRegionServer.getWAL(HRegionServer.java:2513) > ~[hbase-server-2.6.0.jar:2.6.0] > at > org.apache.hadoop.hbase.regionserver.handler.AssignRegionHandler.process(AssignRegionHandler.java:149) > ~[hbase-server-2.6.0.jar:2.6.0] > at > org.apache.hadoop.hbase.executor.EventHandler.run(EventHandler.java:104) > ~[hbase-server-2.6.0.jar:2.6.0] > at > java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149) > ~[?:1.8.0_362] > at > java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624) > ~[?:1.8.0_362] > at java.lang.Thread.run(Thread.java:750) ~[?:1.8.0_362] > Caused by: java.io.IOException: java.lang.NullPointerException > at > org.apache.hadoop.hbase.regionserver.wal.AbstractFSWAL.shutdown(AbstractFSWAL.java:1020) > ~[hbase-server-2.6.0.jar:2.6.0] > at > org.apache.hadoop.hbase.regionserver.wal.AbstractFSWAL.close(AbstractFSWAL.java:1046) > ~[hbase-server-2.6.0.jar:2.6.0] > at > org.apache.hadoop.hbase.wal.AbstractFSWALProvider.getWAL(AbstractFSWALProvider.java:162) > ~[hbase-server-2.6.0.jar:2.6.0] > ... 8 more > Caused by: java.lang.NullPointerException > at > java.util.concurrent.ConcurrentHashMap.putVal(ConcurrentHashMap.java:1011) > ~[?:1.8.0_362] > at > java.util.concurrent.ConcurrentHashMap.put(ConcurrentHashMap.java:1006) > ~[?:1.8.0_362] > at > org.apache.hadoop.hbase.regionserver.wal.AsyncFSWAL.closeWriter(AsyncFSWAL.java:718) > ~[hbase-server-2.6.0.jar:2.6.0] > at > org.apache.hadoop.hbase.regionserver.wal.AsyncFSWAL.doShutdown(AsyncFSWAL.java:775) > ~[hbase-server-2.6.0.jar:2.6.0] > at > org.apache.hadoop.hbase.regionserver.wal.AbstractFSWAL$2.call(AbstractFSWAL.java:992) > ~[hbase-server-2.6.0.jar:2.6.0] > at > org.apache.hadoop.hbase.regionserver.wal.AbstractFSWAL$2.call(AbstractFSWAL.java:987) > ~[hbase-server-2.6.0.jar:2.6.0] > at java.util.concurrent.FutureTask.run(FutureTask.java:266) ~[?:1.8.0_362] > ... 3 more {code} > It cannot open hbase:meta region correctly. {*}The WAL is in a broken > state{*}. > > I tried to start up the HBase in another version (trunk: fdd8fa5) after the > network delay between HBase and HDFS was recovered. However, the HMaster > would also abort with the following exception > {code:java} > 2024-09-28T08:57:22,319 ERROR [master/hmaster:16000:becomeActiveMaster] > master.HMaster: ***** ABORTING master hmaster,16000,1727513781417: Unhandled > exception. Starting shutdown. ***** > java.lang.IllegalStateException: Expected the service > ClusterSchemaServiceImpl [FAILED] to be RUNNING, but the service has FAILED > at > org.apache.hbase.thirdparty.com.google.common.util.concurrent.AbstractService.checkCurrentState(AbstractService.java:384) > ~[hbase-shaded-miscellaneous-4.1.8.jar:4.1.8] > at > org.apache.hbase.thirdparty.com.google.common.util.concurrent.AbstractService.awaitRunning(AbstractService.java:324) > ~[hbase-shaded-miscellaneous-4.1.8.jar:4.1.8] > at > org.apache.hadoop.hbase.master.HMaster.initClusterSchemaService(HMaster.java:1565) > ~[hbase-server-4.0.0-alpha-1-SNAPSHOT.jar:4.0.0-alpha-1-SNAPSHOT] > at > org.apache.hadoop.hbase.master.HMaster.finishActiveMasterInitialization(HMaster.java:1218) > ~[hbase-server-4.0.0-alpha-1-SNAPSHOT.jar:4.0.0-alpha-1-SNAPSHOT] > at > org.apache.hadoop.hbase.master.HMaster.startActiveMasterManager(HMaster.java:2535) > ~[hbase-server-4.0.0-alpha-1-SNAPSHOT.jar:4.0.0-alpha-1-SNAPSHOT] > at org.apache.hadoop.hbase.master.HMaster.lambda$run$0(HMaster.java:613) > ~[hbase-server-4.0.0-alpha-1-SNAPSHOT.jar:4.0.0-alpha-1-SNAPSHOT] > at > org.apache.hadoop.hbase.trace.TraceUtil.lambda$tracedRunnable$2(TraceUtil.java:155) > ~[hbase-common-4.0.0-alpha-1-SNAPSHOT.jar:4.0.0-alpha-1-SNAPSHOT] > at java.lang.Thread.run(Thread.java:833) ~[?:?] > Caused by: org.apache.hadoop.hbase.TableNotFoundException: hbase:namespace > at java.lang.Thread.getStackTrace(Thread.java:1610) ~[?:?] > at > org.apache.hadoop.hbase.util.FutureUtils.setStackTrace(FutureUtils.java:144) > ~[hbase-common-4.0.0-alpha-1-SNAPSHOT.jar:4.0.0-alpha-1-SNAPSHOT] > at org.apache.hadoop.hbase.util.FutureUtils.rethrow(FutureUtils.java:163) > ~[hbase-common-4.0.0-alpha-1-SNAPSHOT.jar:4.0.0-alpha-1-SNAPSHOT] > at > org.apache.hadoop.hbase.client.AsyncTableResultScanner.next(AsyncTableResultScanner.java:173) > ~[hbase-client-4.0.0-alpha-1-SNAPSHOT.jar:4.0.0-alpha-1-SNAPSHOT] > at > org.apache.hadoop.hbase.master.TableNamespaceManager.loadFromNamespace(TableNamespaceManager.java:126) > ~[hbase-server-4.0.0-alpha-1-SNAPSHOT.jar:4.0.0-alpha-1-SNAPSHOT] > at > org.apache.hadoop.hbase.master.TableNamespaceManager.loadNamespaceIntoCache(TableNamespaceManager.java:168) > ~[hbase-server-4.0.0-alpha-1-SNAPSHOT.jar:4.0.0-alpha-1-SNAPSHOT] > at > org.apache.hadoop.hbase.master.TableNamespaceManager.start(TableNamespaceManager.java:175) > ~[hbase-server-4.0.0-alpha-1-SNAPSHOT.jar:4.0.0-alpha-1-SNAPSHOT] > at > org.apache.hadoop.hbase.master.ClusterSchemaServiceImpl.doStart(ClusterSchemaServiceImpl.java:61) > ~[hbase-server-4.0.0-alpha-1-SNAPSHOT.jar:4.0.0-alpha-1-SNAPSHOT] > at > org.apache.hbase.thirdparty.com.google.common.util.concurrent.AbstractService.startAsync(AbstractService.java:252) > ~[hbase-shaded-miscellaneous-4.1.8.jar:4.1.8] > at > org.apache.hadoop.hbase.master.HMaster.initClusterSchemaService(HMaster.java:1562) > ~[hbase-server-4.0.0-alpha-1-SNAPSHOT.jar:4.0.0-alpha-1-SNAPSHOT] > at > org.apache.hadoop.hbase.master.HMaster.finishActiveMasterInitialization(HMaster.java:1218) > ~[hbase-server-4.0.0-alpha-1-SNAPSHOT.jar:4.0.0-alpha-1-SNAPSHOT] > at > org.apache.hadoop.hbase.master.HMaster.startActiveMasterManager(HMaster.java:2535) > ~[hbase-server-4.0.0-alpha-1-SNAPSHOT.jar:4.0.0-alpha-1-SNAPSHOT] > at org.apache.hadoop.hbase.master.HMaster.lambda$run$0(HMaster.java:613) > ~[hbase-server-4.0.0-alpha-1-SNAPSHOT.jar:4.0.0-alpha-1-SNAPSHOT] > at > org.apache.hadoop.hbase.trace.TraceUtil.lambda$tracedRunnable$2(TraceUtil.java:155) > ~[hbase-common-4.0.0-alpha-1-SNAPSHOT.jar:4.0.0-alpha-1-SNAPSHOT] > at java.lang.Thread.run(Thread.java:833) ~[?:?] > {code} > > I need to manually remove all files stored in HDFS and do a fresh restart of > HBase to resolve this problem. I attached my HM and RS logs to this ticket. > I think a better way could be to avoid writing broken WAL to disk, which > likely would block the restart/upgrade. In that case, users do not need to > clear the entire disk state manually. -- This message was sent by Atlassian Jira (v8.20.10#820010)