[ https://issues.apache.org/jira/browse/GEODE-9000?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ]
Ernest Burghardt reassigned GEODE-9000: --------------------------------------- Assignee: Ernest Burghardt > NPE During Reconnect After Network Split > ---------------------------------------- > > Key: GEODE-9000 > URL: https://issues.apache.org/jira/browse/GEODE-9000 > Project: Geode > Issue Type: Bug > Components: membership > Affects Versions: 1.14.0 > Reporter: Juan Ramos > Assignee: Ernest Burghardt > Priority: Major > Labels: blocks-1.14.0 > > During a full network split when all members get shut down by a partition, one > of the servers continually fails to reconnect due to a > {{NullPointerException}}. When using persistent regions, this also prevents > the remaining members from starting up correctly as they might be waiting for > the stuck member to recover the latest data. > The issue itself has been introduced by the fix for GEODE-8901; the new > implementation of {{GMSJoinLeave.processNetworkPartitionMessage}} doesn't > have a {{currentView}} installed during the reconnect phase ({{getView() == > null}}) and the following is shown in the logs: > {noformat} > [fatal 2021/03/04 03:32:02.744 GMT gemfire-cluster-server-0 <ReconnectThread> > tid=0x8a] Unexpected exception while booting membership services > java.lang.NullPointerException > at > org.apache.geode.distributed.internal.membership.gms.membership.GMSJoinLeave.processNetworkPartitionMessage(GMSJoinLeave.java:1459) > at > org.apache.geode.distributed.internal.membership.gms.messenger.JGroupsMessenger$JGroupsReceiver.receive(JGroupsMessenger.java:1343) > at > org.apache.geode.distributed.internal.membership.gms.messenger.JGroupsMessenger.started(JGroupsMessenger.java:428) > at > org.apache.geode.distributed.internal.membership.gms.Services.start(Services.java:210) > at > org.apache.geode.distributed.internal.membership.gms.GMSMembership.start(GMSMembership.java:1782) > at > org.apache.geode.distributed.internal.DistributionImpl.start(DistributionImpl.java:171) > at > 
org.apache.geode.distributed.internal.DistributionImpl.createDistribution(DistributionImpl.java:222) > at > org.apache.geode.distributed.internal.ClusterDistributionManager.<init>(ClusterDistributionManager.java:464) > at > org.apache.geode.distributed.internal.ClusterDistributionManager.<init>(ClusterDistributionManager.java:497) > at > org.apache.geode.distributed.internal.ClusterDistributionManager.create(ClusterDistributionManager.java:326) > at > org.apache.geode.distributed.internal.InternalDistributedSystem.initialize(InternalDistributedSystem.java:779) > at > org.apache.geode.distributed.internal.InternalDistributedSystem.access$200(InternalDistributedSystem.java:135) > at > org.apache.geode.distributed.internal.InternalDistributedSystem$Builder.build(InternalDistributedSystem.java:3034) > at > org.apache.geode.distributed.internal.InternalDistributedSystem.connectInternal(InternalDistributedSystem.java:290) > at > org.apache.geode.distributed.internal.InternalDistributedSystem.reconnect(InternalDistributedSystem.java:2605) > at > org.apache.geode.distributed.internal.InternalDistributedSystem.tryReconnect(InternalDistributedSystem.java:2424) > at > org.apache.geode.distributed.internal.InternalDistributedSystem.disconnect(InternalDistributedSystem.java:1275) > at > org.apache.geode.distributed.internal.ClusterDistributionManager$DMListener.membershipFailure(ClusterDistributionManager.java:2315) > at > org.apache.geode.distributed.internal.membership.gms.GMSMembership.uncleanShutdown(GMSMembership.java:1239) > at > org.apache.geode.distributed.internal.membership.gms.GMSMembership$ManagerImpl.lambda$forceDisconnect$0(GMSMembership.java:1951) > at java.base/java.lang.Thread.run(Thread.java:834) > [error 2021/03/04 03:32:02.747 GMT gemfire-cluster-server-0 <ReconnectThread> > tid=0x8a] Unexpected problem starting up membership services > java.lang.NullPointerException > at > 
org.apache.geode.distributed.internal.membership.gms.membership.GMSJoinLeave.processNetworkPartitionMessage(GMSJoinLeave.java:1459) > at > org.apache.geode.distributed.internal.membership.gms.messenger.JGroupsMessenger$JGroupsReceiver.receive(JGroupsMessenger.java:1343) > at > org.apache.geode.distributed.internal.membership.gms.messenger.JGroupsMessenger.started(JGroupsMessenger.java:428) > at > org.apache.geode.distributed.internal.membership.gms.Services.start(Services.java:210) > at > org.apache.geode.distributed.internal.membership.gms.GMSMembership.start(GMSMembership.java:1782) > at > org.apache.geode.distributed.internal.DistributionImpl.start(DistributionImpl.java:171) > at > org.apache.geode.distributed.internal.DistributionImpl.createDistribution(DistributionImpl.java:222) > at > org.apache.geode.distributed.internal.ClusterDistributionManager.<init>(ClusterDistributionManager.java:464) > at > org.apache.geode.distributed.internal.ClusterDistributionManager.<init>(ClusterDistributionManager.java:497) > at > org.apache.geode.distributed.internal.ClusterDistributionManager.create(ClusterDistributionManager.java:326) > at > org.apache.geode.distributed.internal.InternalDistributedSystem.initialize(InternalDistributedSystem.java:779) > at > org.apache.geode.distributed.internal.InternalDistributedSystem.access$200(InternalDistributedSystem.java:135) > at > org.apache.geode.distributed.internal.InternalDistributedSystem$Builder.build(InternalDistributedSystem.java:3034) > at > org.apache.geode.distributed.internal.InternalDistributedSystem.connectInternal(InternalDistributedSystem.java:290) > at > org.apache.geode.distributed.internal.InternalDistributedSystem.reconnect(InternalDistributedSystem.java:2605) > at > org.apache.geode.distributed.internal.InternalDistributedSystem.tryReconnect(InternalDistributedSystem.java:2424) > at > org.apache.geode.distributed.internal.InternalDistributedSystem.disconnect(InternalDistributedSystem.java:1275) > at > 
org.apache.geode.distributed.internal.ClusterDistributionManager$DMListener.membershipFailure(ClusterDistributionManager.java:2315) > at > org.apache.geode.distributed.internal.membership.gms.GMSMembership.uncleanShutdown(GMSMembership.java:1239) > at > org.apache.geode.distributed.internal.membership.gms.GMSMembership$ManagerImpl.lambda$forceDisconnect$0(GMSMembership.java:1951) > at java.base/java.lang.Thread.run(Thread.java:834) > [warn 2021/03/04 03:32:02.748 GMT gemfire-cluster-server-0 <ReconnectThread> > tid=0x8a] Caught SystemConnectException in reconnect > org.apache.geode.SystemConnectException: Problem starting up membership > services: null. Consult log file for more details > at > org.apache.geode.distributed.internal.DistributionImpl.start(DistributionImpl.java:189) > at > org.apache.geode.distributed.internal.DistributionImpl.createDistribution(DistributionImpl.java:222) > at > org.apache.geode.distributed.internal.ClusterDistributionManager.<init>(ClusterDistributionManager.java:464) > at > org.apache.geode.distributed.internal.ClusterDistributionManager.<init>(ClusterDistributionManager.java:497) > at > org.apache.geode.distributed.internal.ClusterDistributionManager.create(ClusterDistributionManager.java:326) > at > org.apache.geode.distributed.internal.InternalDistributedSystem.initialize(InternalDistributedSystem.java:779) > at > org.apache.geode.distributed.internal.InternalDistributedSystem.access$200(InternalDistributedSystem.java:135) > at > org.apache.geode.distributed.internal.InternalDistributedSystem$Builder.build(InternalDistributedSystem.java:3034) > at > org.apache.geode.distributed.internal.InternalDistributedSystem.connectInternal(InternalDistributedSystem.java:290) > at > org.apache.geode.distributed.internal.InternalDistributedSystem.reconnect(InternalDistributedSystem.java:2605) > at > org.apache.geode.distributed.internal.InternalDistributedSystem.tryReconnect(InternalDistributedSystem.java:2424) > at > 
org.apache.geode.distributed.internal.InternalDistributedSystem.disconnect(InternalDistributedSystem.java:1275) > at > org.apache.geode.distributed.internal.ClusterDistributionManager$DMListener.membershipFailure(ClusterDistributionManager.java:2315) > at > org.apache.geode.distributed.internal.membership.gms.GMSMembership.uncleanShutdown(GMSMembership.java:1239) > at > org.apache.geode.distributed.internal.membership.gms.GMSMembership$ManagerImpl.lambda$forceDisconnect$0(GMSMembership.java:1951) > at java.base/java.lang.Thread.run(Thread.java:834) > [info 2021/03/04 03:32:02.749 GMT gemfire-cluster-server-0 <ReconnectThread> > tid=0x8a] Disconnecting old DistributedSystem to prepare for a reconnect > attempt > {noformat} > The above keeps happening during further reconnect attempts and the server > member can't re-join the distributed system. -- This message was sent by Atlassian Jira (v8.3.4#803005)